diff --git a/CHANGELOG.md b/CHANGELOG.md index 00728725..ed464f61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,15 +13,17 @@ - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). + - [Grouped GEMM with nvfp4 datatype](./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). -* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/77_blackwell_mla.cu). +* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/): both [forward](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) and [backward](./examples/77_blackwell_fmha/77_blackwell_fmha_bwd.cu) passes are supported. * A new [distributed GEMM example](./examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. * Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. 
* Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - Support for [grouped GEMM with blockwise and groupwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. + - Support for [grouped-wise GEMM](./tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. - Support for [mixed-dtype grouped GEMM with groupwise scaling](./examples/69_hopper_mixed_dtype_grouped_gemm) for Hopper architecture. - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e6f298e..b54b8335 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -713,6 +713,7 @@ target_include_directories( CUTLASS SYSTEM INTERFACE $ + $ ) install( diff --git a/README.md b/README.md index 433c375c..50fae016 100644 --- a/README.md +++ b/README.md @@ -50,15 +50,17 @@ architecture. - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). 
- [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). + - [Grouped GEMM with nvfp4 datatype](./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). -* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/77_blackwell_mla.cu). +* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/): both [forward](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) and [backward](./examples/77_blackwell_fmha/77_blackwell_fmha_bwd.cu) passes are supported. * A new [distributed GEMM example](./examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. * Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. * Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. 
- Support for [grouped GEMM with blockwise and groupwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. + - Support for [grouped-wise GEMM](./tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. - Support for [mixed-dtype grouped GEMM with groupwise scaling](./examples/69_hopper_mixed_dtype_grouped_gemm) for Hopper architecture. - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index c963d95e..025eb65f 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -34,7 +34,7 @@ addressable memory, and then store it back into addressable memory. TileIterator is a core concept in CUTLASS that enables efficient loading and storing of data to - and from addressable memory. The PredicateTileIterator accepts a ThreadMap type, which defines + and from addressable memory. The PredicatedTileIterator accepts a ThreadMap type, which defines the mapping of threads to a "tile" in memory. This separation of concerns enables user-defined thread mappings to be specified. 
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu index 1c21678f..5d4fe1a1 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu @@ -75,11 +75,11 @@ #include "cutlass/util/reference/host/tensor_copy.h" #include "cutlass/util/reference/host/tensor_compare.h" #include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gett.hpp" // Includes from examples directory #include "helper.h" #include "hopper_fp8_commandline.hpp" -#include "reference/host/gemm_with_blockwise_scaling.h" using namespace cute; @@ -123,7 +123,13 @@ using ArchTag = cutlass::arch::Sm90; // T using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster -using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum<>; + +using ScaleConfig = decltype(cutlass::detail::sm90_trivial_blockwise_scale_config(TileShape{})); + +using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand +using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand + +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; @@ -143,8 +149,8 @@ using CollectiveEpilogue = typename 
cutlass::epilogue::collective::CollectiveBui using CollectiveMainloopWithBlockWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< ArchTag, OperatorClass, - ElementA, LayoutA, AlignmentA, - ElementB, LayoutB, AlignmentB, + ElementA, cute::tuple, AlignmentA, + ElementB, cute::tuple, AlignmentB, ElementAccumulator, TileShape, ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout< @@ -190,20 +196,22 @@ StrideB stride_B; StrideC stride_C; StrideD stride_D; StrideAux stride_aux; +LayoutSFA layout_SFA; +LayoutSFB layout_SFB; uint64_t seed; +using LayoutScalar = cutlass::layout::PackedVectorLayout; cutlass::HostTensor tensor_A; cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; cutlass::HostTensor tensor_D; uint32_t mma_promotion_interval; -cutlass::HostTensor blockscale_tensor_A; -cutlass::HostTensor blockscale_tensor_B; +cutlass::HostTensor blockscale_tensor_A; +cutlass::HostTensor blockscale_tensor_B; cutlass::HostTensor tensor_ref_D; cutlass::HostTensor tensor_aux; cutlass::HostTensor tensor_ref_aux; -using LayoutScalar = cutlass::layout::PackedVectorLayout; cutlass::HostTensor scalar_alpha; cutlass::HostTensor scalar_beta; cutlass::HostTensor scale_A; @@ -342,26 +350,25 @@ bool initialize_scale_tensor( /// Initialize operands to be used in the GEMM and reference GEMM void initialize(const Options &options) { - // Find Block Scaling tensor shapes based on problem shape and TileShape - auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k); - auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{}))); - auto blockscale_m = cute::get<0>(blockscale_shape); - auto blockscale_n = cute::get<1>(blockscale_shape); - auto blockscale_k = cute::get<2>(blockscale_shape); - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, 
options.l)); stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l)); stride_aux = stride_D; + // Layout SFA and SFB represent logically broadcasting data in CuTe. + // E.g., if Layout SFA has shape ((ScaleGranularityM, M / ScaleGranularityM), (ScaleGraunularityK, K / ScaleGranularityK)) + // and strides ((0, 1), (0, M / ScaleGraunuarlityM)), then each collection of ScaleGranularityM x ScaleGranularityK + // indecies in the tensor map to the same offset. + layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l)); + layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l)); auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); - auto blockscale_a_coord = cutlass::make_Coord(blockscale_m * options.l, blockscale_k); - auto blockscale_b_coord = cutlass::make_Coord(blockscale_k, blockscale_n * options.l); + auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA))); + auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB))); tensor_A.resize(a_coord); blockscale_tensor_A.resize(blockscale_a_coord); @@ -465,7 +472,9 @@ typename Gemm::Arguments args_from_options(const Options &op stride_B, mma_promotion_interval, blockscale_tensor_A.device_data(), - blockscale_tensor_B.device_data() + layout_SFA, + blockscale_tensor_B.device_data(), + layout_SFB }, { {}, // epilogue.thread @@ -519,12 +528,6 @@ bool verify(const Options &options) { // Compute reference output // - // Block scaling tensors shapes based CTA Block (TileShape) and GEMM Problem shape - auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k); - auto 
blockscale_m = ceil_div(options.m, get<0>(TileShape{})); - auto blockscale_n = ceil_div(options.n, get<1>(TileShape{})); - auto blockscale_k = ceil_div(options.k, get<2>(TileShape{})); - // Create instantiation for device reference gemm kernel auto A = cute::make_tensor(tensor_A.host_data(), cute::make_layout( @@ -557,28 +560,18 @@ bool verify(const Options &options) { ) ); - auto blockscale_A = cute::make_tensor(blockscale_tensor_A.host_data(), - cute::make_layout( - cute::make_shape(blockscale_m, blockscale_k, options.l), - cute::make_stride(1, blockscale_m, blockscale_m * blockscale_k) - ) - ); - auto blockscale_B = cute::make_tensor(blockscale_tensor_B.host_data(), - cute::make_layout( - cute::make_shape(blockscale_n, blockscale_k, options.l), - cute::make_stride(1, blockscale_n, blockscale_n * blockscale_k) - ) - ); + auto SFA = cute::make_tensor(blockscale_tensor_A.host_data(), layout_SFA); + auto SFB = cute::make_tensor(blockscale_tensor_B.host_data(), layout_SFB); using unused_t = decltype(D); - cutlass::reference::host::GettMainloopParams mainloop_params{ - A, B, // Operand Tensors - blockscale_A, blockscale_B // Blockwise scaling Tensors - }; + cutlass::reference::host::GettBlockScalingMainloopParams< + ElementAccumulator, + decltype(A), + decltype(SFA), + decltype(B), + decltype(SFB) + > mainloop_params{A, SFA, B, SFB}; cutlass::reference::host::GettEpilogueParams< ElementScalar, diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index b7cdb00a..096e56a6 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ 
-75,11 +75,11 @@ #include "cutlass/util/reference/host/tensor_copy.h" #include "cutlass/util/reference/host/tensor_compare.h" #include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gett.hpp" // Includes from examples directory #include "helper.h" #include "hopper_fp8_commandline.hpp" -#include "reference/host/gemm_with_groupwise_scaling.h" using namespace cute; @@ -120,55 +120,30 @@ using ElementAccumulator = float; // E using ElementBlockScale = float; // Element type for blockscaling during accumulation using ElementCompute = float; // Element type for epilogue computation -using TileShape_ = Shape<_128,_128,_128>; // This one is just to make the compiler happy with verify()... +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster -// ScaleGranularity{M,N}: number of {rows in A}/{columns in B} that share the same scaling factor -// Given TileShape = Shape<_128,_128,_128>: -// ScaleGranularityM == 128 and ScaleGranularityN == 128 --> 2Dx2D (the shape of the scaling factor) -// ScaleGranularityM == 1 and ScaleGranularityN == 128 --> 1Dx2D scaling -// ScaleGranularityM == 128 and ScaleGranularityN == 1 --> 2Dx1D scaling -// ScaleGranularityM == 1 and ScaleGranularityN == 1 --> 1Dx1D scaling -template -struct GroupScaleConfig { - using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature - using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag - using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size - using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster +constexpr int ScaleGranularityM = 1; +constexpr int ScaleGranularityN = 128; +constexpr int 
ScaleGranularityK = 128; - static constexpr int ScaleGranularityM = ScaleGranularityM_; - static constexpr int ScaleGranularityN = ScaleGranularityN_; - static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; - static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; +constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; +constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; - static_assert(size<0>(TileShape{}) == ScaleGranularityM * ScaleMsPerTile, - "FP8 scaling granularity must evenly divide tile shape along M."); - static_assert(size<1>(TileShape{}) == ScaleGranularityN * ScaleNsPerTile, - "FP8 scaling granularity must evenly divide tile shape along N."); +using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig; - using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; - using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; - using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; - using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltActAmaxAux< +using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand +using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand + +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; +using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltActAmaxAux< LayoutAux, cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementC>; -}; -using GroupScale1D1DConfig = GroupScaleConfig< 1, 1>; -using GroupScale1D2DConfig = GroupScaleConfig< 1, size<1>(TileShape_{})>; -using GroupScale2D1DConfig = 
GroupScaleConfig(TileShape_{}), 1>; -using GroupScale2D2DConfig = GroupScaleConfig(TileShape_{}), size<1>(TileShape_{})>; - -template -struct GroupScaleGemm { - using ArchTag = typename ScheduleConfig::ArchTag; - using OperatorClass = typename ScheduleConfig::OperatorClass; - using TileShape = typename ScheduleConfig::TileShape; - using ClusterShape = typename ScheduleConfig::ClusterShape; - using KernelSchedule = typename ScheduleConfig::KernelSchedule; - using EpilogueSchedule = typename ScheduleConfig::EpilogueSchedule; - using EpilogueTileType = typename ScheduleConfig::EpilogueTileType; - using FusionOperation = typename ScheduleConfig::FusionOperation; - - using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, @@ -179,10 +154,10 @@ struct GroupScaleGemm { FusionOperation >::CollectiveOp; - using CollectiveMainloopWithGroupWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< ArchTag, OperatorClass, - ElementA, LayoutA, AlignmentA, - ElementB, LayoutB, AlignmentB, + ElementA, cute::tuple, AlignmentA, + ElementB, cute::tuple, AlignmentB, ElementAccumulator, TileShape, ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout< @@ -191,38 +166,26 @@ struct GroupScaleGemm { KernelSchedule >::CollectiveOp; - using GemmKernelDefault = cutlass::gemm::kernel::GemmUniversal< - Shape, - CollectiveMainloopWithGroupWiseScaling, - CollectiveEpilogue + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue, + cutlass::gemm::StreamKScheduler >; - using GemmKernelStreamK = cutlass::gemm::kernel::GemmUniversal< - Shape, - CollectiveMainloopWithGroupWiseScaling, - CollectiveEpilogue, - cutlass::gemm::StreamKScheduler - >; - - using 
GemmDefault = cutlass::gemm::device::GemmUniversalAdapter; - using GemmStreamK = cutlass::gemm::device::GemmUniversalAdapter; -}; - -using GroupScale1D1DGemm = GroupScaleGemm; -using GroupScale1D2DGemm = GroupScaleGemm; -using GroupScale2D1DGemm = GroupScaleGemm; -using GroupScale2D2DGemm = GroupScaleGemm; +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; // Extract information from Gemm kernel. -using EpilogueOutputOp = typename GroupScale1D1DGemm::GemmDefault::EpilogueOutputOp; +using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; using ElementScalar = typename EpilogueOutputOp::ElementScalar; using ElementAmax = typename EpilogueOutputOp::ElementAmax; using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; -using StrideA = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideA; -using StrideB = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideB; -using StrideC = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideC; -using StrideD = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideD; +using StrideA = typename Gemm::GemmKernel::StrideA; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; using StrideAux = StrideD; constexpr bool IsDFp8 = @@ -242,20 +205,23 @@ StrideB stride_B; StrideC stride_C; StrideD stride_D; StrideAux stride_aux; +LayoutSFA layout_SFA; +LayoutSFB layout_SFB; uint64_t seed; +using LayoutScalar = cutlass::layout::PackedVectorLayout; + cutlass::HostTensor tensor_A; cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; cutlass::HostTensor tensor_D; uint32_t mma_promotion_interval; -cutlass::HostTensor blockscale_tensor_A; -cutlass::HostTensor blockscale_tensor_B; +cutlass::HostTensor blockscale_tensor_A; +cutlass::HostTensor blockscale_tensor_B; cutlass::HostTensor tensor_ref_D; cutlass::HostTensor tensor_aux; cutlass::HostTensor tensor_ref_aux; -using LayoutScalar = 
cutlass::layout::PackedVectorLayout; cutlass::HostTensor scalar_alpha; cutlass::HostTensor scalar_beta; cutlass::HostTensor scale_A; @@ -392,32 +358,25 @@ bool initialize_scale_tensor( } /// Initialize operands to be used in the GEMM and reference GEMM -template void initialize(const Options &options) { - using TileShape = typename GroupScaleConfig::TileShape; - const int ScaleGranularityM = GroupScaleConfig::ScaleGranularityM; - const int ScaleGranularityN = GroupScaleConfig::ScaleGranularityN; - assert(options.m % ScaleGranularityM == 0); assert(options.n % ScaleGranularityN == 0); - // Find Group Scaling tensor shapes based on `ScaleGranularityM`, problem shape, and TileShape - auto groupscale_m = ceil_div(options.m, ScaleGranularityM); - auto groupscale_n = ceil_div(options.n, ScaleGranularityN); - auto blockscale_k = ceil_div(options.k, cute::get<2>(TileShape{})); - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l)); stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l)); stride_aux = stride_D; + layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l)); + layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l)); + auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); - auto groupscale_a_coord = cutlass::make_Coord(groupscale_m * options.l, blockscale_k); - auto groupscale_b_coord = cutlass::make_Coord(groupscale_n * options.l, blockscale_k); + auto groupscale_a_coord = 
cutlass::make_Coord(size(filter_zeros(layout_SFA))); + auto groupscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB))); tensor_A.resize(a_coord); tensor_B.resize(b_coord); @@ -522,7 +481,9 @@ GemmArguments args_from_options(const Options &options) stride_B, mma_promotion_interval, blockscale_tensor_A.device_data(), - blockscale_tensor_B.device_data() + layout_SFA, + blockscale_tensor_B.device_data(), + layout_SFB }, { {}, // epilogue.thread @@ -572,19 +533,10 @@ GemmArguments args_from_options(const Options &options) } /// Don't know why the compiler does not like verify() being templated... -bool verify(const Options &options, const int ScaleMsPerTile, const int ScaleNsPerTile) { +bool verify(const Options &options) { // // Compute reference output // - const int ScaleGranularityM = get<0>(TileShape_{}) / ScaleMsPerTile; - const int ScaleGranularityN = get<1>(TileShape_{}) / ScaleNsPerTile; - - // Group scaling tensors shapes based `ScaleGranularityM`, CTA Block (TileShape) and GEMM Problem shape - auto blockscale_m = ceil_div(options.m, get<0>(TileShape_{})); - auto blockscale_n = ceil_div(options.n, get<1>(TileShape_{})); - auto blockscale_k = ceil_div(options.k, get<2>(TileShape_{})); - auto groupscale_m = ceil_div(options.m, ScaleGranularityM); - auto groupscale_n = ceil_div(options.n, ScaleGranularityN); // Create instantiation for device reference gemm kernel auto A = cute::make_tensor(tensor_A.host_data(), @@ -618,28 +570,18 @@ bool verify(const Options &options, const int ScaleMsPerTile ) ); - auto blockscale_A = cute::make_tensor(blockscale_tensor_A.host_data(), - cute::make_layout( - cute::make_shape(groupscale_m, blockscale_k, options.l), - cute::make_stride(1, groupscale_m, groupscale_m * blockscale_k) - ) - ); - auto blockscale_B = cute::make_tensor(blockscale_tensor_B.host_data(), - cute::make_layout( - cute::make_shape(groupscale_n, blockscale_k, options.l), - cute::make_stride(1, groupscale_n, groupscale_n * blockscale_k) - ) - ); + 
auto SFA = cute::make_tensor(blockscale_tensor_A.host_data(), layout_SFA); + auto SFB = cute::make_tensor(blockscale_tensor_B.host_data(), layout_SFB); using unused_t = decltype(D); - cutlass::reference::host::GettMainloopParams mainloop_params{ - A, B, // Operand Tensors - blockscale_A, blockscale_B // Groupwise scaling Tensors - }; + cutlass::reference::host::GettBlockScalingMainloopParams< + ElementAccumulator, + decltype(A), + decltype(SFA), + decltype(B), + decltype(SFB) + > mainloop_params{A, SFA, B, SFB}; cutlass::reference::host::GettEpilogueParams< ElementScalar, @@ -713,14 +655,7 @@ bool verify(const Options &options, const int ScaleMsPerTile } /// Execute a given example GEMM computation -template -int run(Options &options) -{ - using TileShape = typename GroupScaleConfig::TileShape; - const int ScaleGranularityM = GroupScaleConfig::ScaleGranularityM; - const int ScaleGranularityN = GroupScaleConfig::ScaleGranularityN; - const int ScaleMsPerTile = GroupScaleConfig::ScaleMsPerTile; - const int ScaleNsPerTile = GroupScaleConfig::ScaleNsPerTile; +int run(Options &options) { bool skip = false; std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; @@ -747,7 +682,7 @@ int run(Options &options) if (!skip) std::cout << " Running... " << std::endl; else return -1; - initialize(options); + initialize(options); // Instantiate CUTLASS kernel depending on templates Gemm gemm; @@ -773,7 +708,7 @@ int run(Options &options) // Check if output from CUTLASS kernel and reference kernel are equal or not Result result; if (options.verify) { - result.passed = verify(options, ScaleMsPerTile, ScaleNsPerTile); + result.passed = verify(options); std::cout << " Disposition: " << (result.passed ? 
"Passed" : "Failed") << std::endl; } @@ -860,28 +795,7 @@ int main(int argc, char const **args) { #if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) bool passed = true; - std::cout << "Basic split-K GEMM kernel" << std::endl; - passed &= run(options); - std::cout << std::endl; - passed &= run(options); - std::cout << std::endl; - passed &= run(options); - std::cout << std::endl; - passed &= run(options); - std::cout << std::endl; - - std::cout << std::endl; - - std::cout << "StreamK GEMM kernel" << std::endl; - passed &= run(options); - std::cout << std::endl; - passed &= run(options); - std::cout << std::endl; - passed &= run(options); - std::cout << std::endl; - passed &= run(options); - std::cout << std::endl; - + passed = run(options); if (!passed) return -1; #endif diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h deleted file mode 100644 index 8904060c..00000000 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h +++ /dev/null @@ -1,504 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GETT in host-side code. 
-*/ - -#pragma once - -///////////////////////////////////////////////////////////////////////////////////////////////// -#include "cutlass/gemm/gemm.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/epilogue/thread/activation.h" -#include "cutlass/relatively_equal.h" -#include -#include "cute/tensor.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::reference::host { - -template -struct ElementTraits { - using type = T; -}; - -template -struct ElementTraits().get()), void> > > { - using type = decltype(std::declval().get()); -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template< - class ElementAccumulator_, - class TensorA_, // (M, K, L) - class TensorB_, // (N, K, L) - class TensorScaleA_, // (m, k, L) - class TensorScaleB_, // (n, k, L) - class TileShape_ -> -struct GettMainloopParams { - using ElementAccumulator = ElementAccumulator_; - using TensorA = TensorA_; - using TensorB = TensorB_; - using EngineA = typename TensorA::engine_type; - using LayoutA = typename TensorA::layout_type; - using EngineB = typename TensorB::engine_type; - using LayoutB = typename TensorB::layout_type; - - using TensorScaleA = TensorScaleA_; - using TensorScaleB = TensorScaleB_; - using TileShape = TileShape_; - using EngineScaleA = typename TensorScaleA::engine_type; - using EngineScaleB = typename TensorScaleB::engine_type; - - TensorA A{}; - TensorB B{}; - TensorScaleA ScaleA{}; - TensorScaleB ScaleB{}; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// -template< - class ElementScalar_, - class ElementScalingFactor_, - class ElementAccumulator_, - class ElementCompute_, - class TensorC_, // (M, N, L) - class TensorD_, // (M, N, L) - class VectorBias_ = TensorD_, // (M, 1) - class TensorAux_ = TensorD_, // (M, N, L) - class VectorAlpha_ = 
TensorD_, // (M, 1) - class VectorBeta_ = VectorAlpha_, // (M, 1) - class ActivationFunctor_ = cutlass::epilogue::thread::Identity, - class BiasBinaryOp_ = cutlass::plus, - bool PerColumnBias_ = false -> -struct GettEpilogueParams { - using ElementScalar = ElementScalar_; - using ElementScalingFactor = ElementScalingFactor_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using TensorC = TensorC_; - using TensorD = TensorD_; - using TensorAux = TensorAux_; - using VectorBias = VectorBias_; - using VectorAlpha = VectorAlpha_; - using VectorBeta = VectorBeta_; - using ActivationFunctor = ActivationFunctor_; - using BiasBinaryOp = BiasBinaryOp_; - - using EngineC = typename TensorC::engine_type; - using LayoutC = typename TensorC::layout_type; - using EngineD = typename TensorD::engine_type; - using LayoutD = typename TensorD::layout_type; - static constexpr bool PerColumnBias = PerColumnBias_; - ElementScalar alpha = ElementScalar(1); - ElementScalar beta = ElementScalar(0); - - TensorC C{}; - TensorD D{}; - VectorBias Bias{}; - TensorAux Aux{}; - VectorAlpha Valpha{}; - VectorBeta Vbeta{}; - ElementCompute st = ElementCompute(1); - - ElementAccumulator* abs_max_D = nullptr; - ElementAccumulator* abs_max_Aux = nullptr; - - ElementScalingFactor scale_a = ElementScalingFactor(1); - ElementScalingFactor scale_b = ElementScalingFactor(1); - ElementScalingFactor scale_c = ElementScalingFactor(1); - ElementScalingFactor scale_d = ElementScalingFactor(1); - ElementScalingFactor scale_aux = ElementScalingFactor(1); - - bool beta_per_channel_scaling = false; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - General Tensor-Tensor contraction reference kernel with Blockwise scaling -template < - class MainloopParams, - class EpilogueParams -> -void Gett( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - - static int constexpr 
kBlockM = cute::get<0>(typename MainloopParams::TileShape{}); - static int constexpr kBlockN = cute::get<1>(typename MainloopParams::TileShape{}); - // printf("mainloop_params.ScaleA.layout()"); cute::print(mainloop_params.ScaleA.layout()); printf("\n"); - // printf("mainloop_params.ScaleB.layout()"); cute::print(mainloop_params.ScaleB.layout()); printf("\n"); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int64_t l = 0; l < cute::size<2>(mainloop_params.A.layout()); ++l) { - for (int64_t m = 0; m < cute::size<0>(mainloop_params.A.layout()); m += kBlockM) { - for (int64_t n = 0; n < cute::size<0>(mainloop_params.B.layout()); n += kBlockN) { - typename MainloopParams::ElementAccumulator acc[kBlockM][kBlockN]; - gett_mainloop(mainloop_params, m, n, l, acc); - gett_epilogue(epilogue_params, m, n, l, acc); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Mainloop -template -void gett_mainloop( - MainloopParams const& mainloop_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B"); - static_assert(cute::rank(typename MainloopParams::LayoutB{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementA = typename ElementTraits::type; - using ElementB = typename ElementTraits::type; - using ElementBlockScaleA = typename ElementTraits::type; - using ElementBlockScaleB = typename ElementTraits::type; - - using RingOp = multiply_add; - RingOp fma_op; - - multiplies scale_op; - - static int constexpr kBlockK = cute::get<2>(typename MainloopParams::TileShape{});; - - // Tempo accumulators to seperate blockwise accumulation - typename MainloopParams::ElementAccumulator acc_temp[kBlockM][kBlockN]; - - // Zero out accumulators - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc[m_b][n_b] = 
ElementAccumulator(0); // RingOp::AdditionIdentity - acc_temp[m_b][n_b] = ElementAccumulator(0); - } - } - - int64_t block_m = m / kBlockM; - int64_t block_n = n / kBlockN; - cute::Tensor blockscale_A = mainloop_params.ScaleA(block_m, _, l); - cute::Tensor blockscale_B = mainloop_params.ScaleB(block_n, _, l); - - // Compute on this k-block - for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) { - - // Load Blockwise scaling factor from blockscale Tensors for A and B - int64_t block_k = k / kBlockK; - ElementBlockScaleA scale_a = blockscale_A[block_k]; - ElementBlockScaleB scale_b = blockscale_B[block_k]; - - // Load A - ElementAccumulator a_frag[kBlockM]; - for (int m_b = 0; m_b < kBlockM; ++m_b) { - if (m + m_b < cute::size<0>(mainloop_params.A.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. - a_frag[m_b] = static_cast(ElementA(mainloop_params.A(m + m_b, k, l))); - } else { - a_frag[m_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // Load B - ElementAccumulator b_frag[kBlockN]; - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (n + n_b < cute::size<0>(mainloop_params.B.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. 
- b_frag[n_b] = static_cast(ElementB(mainloop_params.B(n + n_b, k, l))); - } else { - b_frag[n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // do compute - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc_temp[m_b][n_b] = fma_op(a_frag[m_b], b_frag[n_b], acc_temp[m_b][n_b]); - } - } - - // Apply Blockwise-scaling at kBlockK boundary - // (a) Apply block scaling factors on the partial accumulated results (acc_temp) at the kBlocK boundary - // (b) Zero-out partial temporary (acc_temp), - // (c) Update permanent (accu) - if ((k+1) % kBlockK == 0) { - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - ElementAccumulator blockwise_scaled_accum = acc_temp[m_b][n_b] * scale_a * scale_b; - acc[m_b][n_b] = blockwise_scaled_accum + acc[m_b][n_b]; - acc_temp[m_b][n_b] = ElementAccumulator(0); - } - } - } - - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Epilogue -template -void gett_epilogue( - EpilogueParams const& epilogue_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == 3, "M, K, B"); - static_assert(cute::rank(typename EpilogueParams::LayoutD{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementCompute = typename EpilogueParams::ElementCompute; - using ElementC = typename EpilogueParams::TensorC::value_type; - using ElementD = typename EpilogueParams::TensorD::value_type; - using ElementAux = typename EpilogueParams::TensorAux::value_type; - using ElementBias = typename EpilogueParams::VectorBias::value_type; - using ElementScalar = typename EpilogueParams::ElementScalar; - using ElementScalingFactor = typename EpilogueParams::ElementScalingFactor; - using ActivationFunctor = typename EpilogueParams::ActivationFunctor; - using BiasBinaryOp = typename 
EpilogueParams::BiasBinaryOp; - - constexpr bool PerColBias = EpilogueParams::PerColumnBias; - constexpr bool IsScalingAndAmaxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsScalingAndAmaxAuxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsReLUAuxNeeded = - (cute::is_same_v> or - cute::is_same_v>) and - cute::is_same_v; - constexpr bool IsClamp = - cute::is_same_v>; - - constexpr bool IsBackpropFusion = - cute::is_same_v> or - cute::is_same_v>; - - // Input related converter - NumericConverter accumulator_converter; - NumericConverter source_converter; - NumericConverter bias_converter; - [[maybe_unused]] NumericConverter aux_source_converter; - - // Scale related converter - NumericConverter scale_converter; - NumericConverter scaling_factor_converter; - - // Abs max converter - [[maybe_unused]] NumericConverter abs_max_output_converter; - - // Output related converter - NumericConverter destination_converter; - [[maybe_unused]] NumericConverter aux_destination_converter; - NumericConverter dBias_converter; - - // Epilogue operations - multiply_add epilogue_fma; - multiplies mul; - plus add; - - // Activation operation - ActivationFunctor activation; - - // Bias binary operation - BiasBinaryOp bias_op; - - // Do conversion - ElementCompute converted_alpha = scale_converter(epilogue_params.alpha); - ElementCompute converted_beta = scale_converter(epilogue_params.beta); - ElementCompute converted_scale_a = scaling_factor_converter(epilogue_params.scale_a); - ElementCompute converted_scale_b = scaling_factor_converter(epilogue_params.scale_b); - ElementCompute converted_scale_c = scaling_factor_converter(epilogue_params.scale_c); - ElementCompute converted_scale_d = scaling_factor_converter(epilogue_params.scale_d); - ElementCompute converted_scale_aux = scaling_factor_converter(epilogue_params.scale_aux); - - // Init local var - [[maybe_unused]] ElementCompute local_abs_max_output = ElementCompute(0); - 
[[maybe_unused]] ElementCompute local_abs_max_aux_output = ElementCompute(0); - - converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); - converted_beta = mul(converted_beta, converted_scale_c); - - ElementCompute inter_accum[kBlockM][kBlockN]; - - for (int m_b = 0; m_b < kBlockM; ++m_b) { - ElementCompute local_dBias = ElementCompute(0); - - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - // Convert every type to ElementCompute first, do compute, convert to output type, write it out - ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]); - // per-row alpha - if (raw_pointer_cast(epilogue_params.Valpha.data())) { - converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b)); - } - ElementCompute output = mul(converted_alpha, converted_acc); - - if (raw_pointer_cast(epilogue_params.Bias.data()) && not IsBackpropFusion) { - ElementCompute converted_bias = bias_converter(epilogue_params.Bias(PerColBias ? 
n + n_b : m + m_b)); - output = bias_op(output, converted_bias); - } - - if (raw_pointer_cast(epilogue_params.C.data())) { - ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l)); - // per-row beta - if (epilogue_params.Vbeta.data()) { - converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b)); - } - output = epilogue_fma(converted_beta, converted_src, output); - } - - if constexpr (IsBackpropFusion) { - ElementAux aux_input = ElementAux(0); - if (raw_pointer_cast(epilogue_params.Aux.data())) { - aux_input = epilogue_params.Aux(m + m_b, n + n_b, l); - } - - output = activation(output, aux_source_converter(aux_input)); - local_dBias = add(local_dBias, output); - } - else { - if (raw_pointer_cast(epilogue_params.Aux.data())) { - auto aux_output = output; - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_aux_output = amax_op(local_abs_max_aux_output, aux_output); - aux_output = epilogue_fma(converted_scale_aux, aux_output, ElementCompute(0)); - } - - if constexpr (IsReLUAuxNeeded) { - epilogue_params.Aux(m + m_b, n + n_b, l) = not (aux_output < 0) ? 
uint1b_t(1) : uint1b_t(0); - } else { - epilogue_params.Aux(m + m_b, n + n_b, l) = aux_destination_converter(aux_output); - } - } - - if constexpr (IsClamp) { // Treat Clamp as ReLU - output = activation(output, {0, std::numeric_limits::max()}); - } - else { - output = activation(output); - } - } - - if constexpr (IsScalingAndAmaxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_output = amax_op(local_abs_max_output, output); - output = epilogue_fma(converted_scale_d, output, ElementCompute(0)); - } - - inter_accum[m_b][n_b] = ElementCompute(output); - } - } // n_b - - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n < cute::size<1>(epilogue_params.D.layout())) { - if (raw_pointer_cast(epilogue_params.Bias.data()) && IsBackpropFusion) { - ElementCompute converted_dBias = bias_converter(epilogue_params.Bias(m + m_b)); - local_dBias = add(local_dBias, converted_dBias); - epilogue_params.Bias(m + m_b) = dBias_converter(local_dBias); - } - } - } // m_b - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - epilogue_params.D(m + m_b, n + n_b, l) = destination_converter(inter_accum[m_b][n_b]); - } - } - } - -#if defined(_OPENMP) - #pragma omp critical(Abs_Max_Data_Update) -#endif - { - if constexpr (IsScalingAndAmaxOutputNeeded) { - if (epilogue_params.abs_max_D) { - *epilogue_params.abs_max_D = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_D, abs_max_output_converter(local_abs_max_output)); - } - } - - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - if (epilogue_params.abs_max_Aux) { - *epilogue_params.abs_max_Aux = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_Aux, abs_max_output_converter(local_abs_max_aux_output)); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GEMM - General 
Matrix-Matrix contraction without conjugation options -template < - class MainloopParams, - class EpilogueParams -> -void Gemm3x( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - using namespace cute; - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename MainloopParams::LayoutB{})); - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == cute::rank(typename EpilogueParams::LayoutD{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename EpilogueParams::LayoutC{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "Only Rank3 Tensors (M, K, Batch_Count) " - "with Batchmode are supported"); - // Lower the Matrix-Multiplication with Blockwise scaling (Gemm3x) to a Tensor Contraction (Gett). - Gett(mainloop_params, epilogue_params); -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // cutlass::reference::host - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h deleted file mode 100644 index 0bf90a41..00000000 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h +++ /dev/null @@ -1,518 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GETT in host-side code. 
-*/ - -#pragma once - -///////////////////////////////////////////////////////////////////////////////////////////////// -#include "cutlass/gemm/gemm.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/epilogue/thread/activation.h" -#include "cutlass/relatively_equal.h" -#include -#include "cute/tensor.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::reference::host { - -template -struct ElementTraits { - using type = T; -}; - -template -struct ElementTraits().get()), void> > > { - using type = decltype(std::declval().get()); -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template< - class ElementAccumulator_, - class TensorA_, // (M, K, L) - class TensorB_, // (N, K, L) - class TensorScaleA_, // (m, k, L) - class TensorScaleB_, // (n, k, L) - class TileShape_ -> -struct GettMainloopParams { - using ElementAccumulator = ElementAccumulator_; - using TensorA = TensorA_; - using TensorB = TensorB_; - using EngineA = typename TensorA::engine_type; - using LayoutA = typename TensorA::layout_type; - using EngineB = typename TensorB::engine_type; - using LayoutB = typename TensorB::layout_type; - - using TensorScaleA = TensorScaleA_; - using TensorScaleB = TensorScaleB_; - using TileShape = TileShape_; - using EngineScaleA = typename TensorScaleA::engine_type; - using EngineScaleB = typename TensorScaleB::engine_type; - - TensorA A{}; - TensorB B{}; - TensorScaleA ScaleA{}; - TensorScaleB ScaleB{}; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// -template< - class ElementScalar_, - class ElementScalingFactor_, - class ElementAccumulator_, - class ElementCompute_, - class TensorC_, // (M, N, L) - class TensorD_, // (M, N, L) - class VectorBias_ = TensorD_, // (M, 1) - class TensorAux_ = TensorD_, // (M, N, L) - class VectorAlpha_ = 
TensorD_, // (M, 1) - class VectorBeta_ = VectorAlpha_, // (M, 1) - class ActivationFunctor_ = cutlass::epilogue::thread::Identity, - class BiasBinaryOp_ = cutlass::plus, - bool PerColumnBias_ = false -> -struct GettEpilogueParams { - using ElementScalar = ElementScalar_; - using ElementScalingFactor = ElementScalingFactor_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using TensorC = TensorC_; - using TensorD = TensorD_; - using TensorAux = TensorAux_; - using VectorBias = VectorBias_; - using VectorAlpha = VectorAlpha_; - using VectorBeta = VectorBeta_; - using ActivationFunctor = ActivationFunctor_; - using BiasBinaryOp = BiasBinaryOp_; - - using EngineC = typename TensorC::engine_type; - using LayoutC = typename TensorC::layout_type; - using EngineD = typename TensorD::engine_type; - using LayoutD = typename TensorD::layout_type; - static constexpr bool PerColumnBias = PerColumnBias_; - ElementScalar alpha = ElementScalar(1); - ElementScalar beta = ElementScalar(0); - - TensorC C{}; - TensorD D{}; - VectorBias Bias{}; - TensorAux Aux{}; - VectorAlpha Valpha{}; - VectorBeta Vbeta{}; - ElementCompute st = ElementCompute(1); - - ElementAccumulator* abs_max_D = nullptr; - ElementAccumulator* abs_max_Aux = nullptr; - - ElementScalingFactor scale_a = ElementScalingFactor(1); - ElementScalingFactor scale_b = ElementScalingFactor(1); - ElementScalingFactor scale_c = ElementScalingFactor(1); - ElementScalingFactor scale_d = ElementScalingFactor(1); - ElementScalingFactor scale_aux = ElementScalingFactor(1); - - bool beta_per_channel_scaling = false; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - General Tensor-Tensor contraction reference kernel with Groupwise scaling -template < - class MainloopParams, - class EpilogueParams -> -void Gett( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - - static int constexpr 
kBlockM = cute::get<0>(typename MainloopParams::TileShape{}); - static int constexpr kBlockN = cute::get<1>(typename MainloopParams::TileShape{}); - // printf("mainloop_params.ScaleA.layout()"); cute::print(mainloop_params.ScaleA.layout()); printf("\n"); - // printf("mainloop_params.ScaleB.layout()"); cute::print(mainloop_params.ScaleB.layout()); printf("\n"); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int64_t l = 0; l < cute::size<2>(mainloop_params.A.layout()); ++l) { - for (int64_t m = 0; m < cute::size<0>(mainloop_params.A.layout()); m += kBlockM) { - for (int64_t n = 0; n < cute::size<0>(mainloop_params.B.layout()); n += kBlockN) { - typename MainloopParams::ElementAccumulator acc[kBlockM][kBlockN]; - gett_mainloop(mainloop_params, m, n, l, acc); - gett_epilogue(epilogue_params, m, n, l, acc); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Mainloop -template -void gett_mainloop( - MainloopParams const& mainloop_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B"); - static_assert(cute::rank(typename MainloopParams::LayoutB{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementA = typename ElementTraits::type; - using ElementB = typename ElementTraits::type; - using ElementBlockScaleA = typename ElementTraits::type; - using ElementBlockScaleB = typename ElementTraits::type; - - using RingOp = multiply_add; - RingOp fma_op; - - multiplies scale_op; - - static int constexpr kBlockK = cute::get<2>(typename MainloopParams::TileShape{});; - - // Tempo accumulators to seperate blockwise accumulation - typename MainloopParams::ElementAccumulator acc_temp[kBlockM][kBlockN]; - - // Zero out accumulators - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc[m_b][n_b] = 
ElementAccumulator(0); // RingOp::AdditionIdentity - acc_temp[m_b][n_b] = ElementAccumulator(0); - } - } - - const int M = cute::size<0>(mainloop_params.A.layout()); - const int N = cute::size<0>(mainloop_params.B.layout()); - const int ScaleGranularityM = M / cute::size<0>(mainloop_params.ScaleA); - const int ScaleGranularityN = N / cute::size<0>(mainloop_params.ScaleB); - assert(ScaleGranularityM && M % ScaleGranularityM == 0 - && "ScaleGranularityM must divide M"); - assert(ScaleGranularityN && N % ScaleGranularityN == 0 - && "ScaleGranularityN must divide N"); - - cute::Tensor blockscale_A = domain_offset( - make_coord(m / ScaleGranularityM, _0{}), mainloop_params.ScaleA(_, _, l)); - cute::Tensor blockscale_B = domain_offset( - make_coord(n / ScaleGranularityN, _0{}), mainloop_params.ScaleB(_, _, l)); - - // Compute on this k-block - for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) { - - // Load Blockwise scaling factor from blockscale Tensors for B - int64_t block_k = k / kBlockK; - cute::Tensor scale_a = blockscale_A(_, block_k); - cute::Tensor scale_b = blockscale_B(_, block_k); - - // Load A - ElementAccumulator a_frag[kBlockM]; - for (int m_b = 0; m_b < kBlockM; ++m_b) { - if (m + m_b < cute::size<0>(mainloop_params.A.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. - a_frag[m_b] = static_cast(ElementA(mainloop_params.A(m + m_b, k, l))); - } else { - a_frag[m_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // Load B - ElementAccumulator b_frag[kBlockN]; - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (n + n_b < cute::size<0>(mainloop_params.B.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. 
- b_frag[n_b] = static_cast(ElementB(mainloop_params.B(n + n_b, k, l))); - } else { - b_frag[n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - int m_size = std::min(static_cast(kBlockM), cute::size<0>(mainloop_params.A.layout()) - m); - int n_size = std::min(static_cast(kBlockN), cute::size<0>(mainloop_params.B.layout()) - n); - - // do compute - for (int m_b = 0; m_b < m_size; ++m_b) { - for (int n_b = 0; n_b < n_size; ++n_b) { - acc_temp[m_b][n_b] = fma_op(a_frag[m_b], b_frag[n_b], acc_temp[m_b][n_b]); - } - } - - // Apply Groupwise-scaling at kBlockK boundary - // (a) Apply group and block scaling factors on the partial accumulated results (acc_temp) at the kBlocK boundary - // (b) Zero-out partial temporary (acc_temp), - // (c) Update permanent (accu) - if ((k+1) % kBlockK == 0) { - for (int m_b = 0; m_b < m_size; ++m_b) { - auto scale_a_m_b = scale_a[m_b / ScaleGranularityM]; - for (int n_b = 0; n_b < n_size; ++n_b) { - auto scale_b_n_b = scale_b[n_b / ScaleGranularityN]; - ElementAccumulator blockwise_scaled_accum = acc_temp[m_b][n_b] * scale_a_m_b * scale_b_n_b; - acc[m_b][n_b] = blockwise_scaled_accum + acc[m_b][n_b]; - acc_temp[m_b][n_b] = ElementAccumulator(0); - } - } - } - - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Epilogue -template -void gett_epilogue( - EpilogueParams const& epilogue_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == 3, "M, K, B"); - static_assert(cute::rank(typename EpilogueParams::LayoutD{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementCompute = typename EpilogueParams::ElementCompute; - using ElementC = typename EpilogueParams::TensorC::value_type; - using ElementD = typename EpilogueParams::TensorD::value_type; - using ElementAux = typename EpilogueParams::TensorAux::value_type; - using ElementBias 
= typename EpilogueParams::VectorBias::value_type; - using ElementScalar = typename EpilogueParams::ElementScalar; - using ElementScalingFactor = typename EpilogueParams::ElementScalingFactor; - using ActivationFunctor = typename EpilogueParams::ActivationFunctor; - using BiasBinaryOp = typename EpilogueParams::BiasBinaryOp; - - constexpr bool PerColBias = EpilogueParams::PerColumnBias; - constexpr bool IsScalingAndAmaxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsScalingAndAmaxAuxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsReLUAuxNeeded = - (cute::is_same_v> or - cute::is_same_v>) and - cute::is_same_v; - constexpr bool IsClamp = - cute::is_same_v>; - - constexpr bool IsBackpropFusion = - cute::is_same_v> or - cute::is_same_v>; - - // Input related converter - NumericConverter accumulator_converter; - NumericConverter source_converter; - NumericConverter bias_converter; - [[maybe_unused]] NumericConverter aux_source_converter; - - // Scale related converter - NumericConverter scale_converter; - NumericConverter scaling_factor_converter; - - // Abs max converter - [[maybe_unused]] NumericConverter abs_max_output_converter; - - // Output related converter - NumericConverter destination_converter; - [[maybe_unused]] NumericConverter aux_destination_converter; - NumericConverter dBias_converter; - - // Epilogue operations - multiply_add epilogue_fma; - multiplies mul; - plus add; - - // Activation operation - ActivationFunctor activation; - - // Bias binary operation - BiasBinaryOp bias_op; - - // Do conversion - ElementCompute converted_alpha = scale_converter(epilogue_params.alpha); - ElementCompute converted_beta = scale_converter(epilogue_params.beta); - ElementCompute converted_scale_a = scaling_factor_converter(epilogue_params.scale_a); - ElementCompute converted_scale_b = scaling_factor_converter(epilogue_params.scale_b); - ElementCompute converted_scale_c = 
scaling_factor_converter(epilogue_params.scale_c); - ElementCompute converted_scale_d = scaling_factor_converter(epilogue_params.scale_d); - ElementCompute converted_scale_aux = scaling_factor_converter(epilogue_params.scale_aux); - - // Init local var - [[maybe_unused]] ElementCompute local_abs_max_output = ElementCompute(0); - [[maybe_unused]] ElementCompute local_abs_max_aux_output = ElementCompute(0); - - converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); - converted_beta = mul(converted_beta, converted_scale_c); - - ElementCompute inter_accum[kBlockM][kBlockN]; - - for (int m_b = 0; m_b < kBlockM; ++m_b) { - ElementCompute local_dBias = ElementCompute(0); - - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - // Convert every type to ElementCompute first, do compute, convert to output type, write it out - ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]); - // per-row alpha - if (raw_pointer_cast(epilogue_params.Valpha.data())) { - converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b)); - } - ElementCompute output = mul(converted_alpha, converted_acc); - - if (raw_pointer_cast(epilogue_params.Bias.data()) && not IsBackpropFusion) { - ElementCompute converted_bias = bias_converter(epilogue_params.Bias(PerColBias ? 
n + n_b : m + m_b)); - output = bias_op(output, converted_bias); - } - - if (raw_pointer_cast(epilogue_params.C.data())) { - ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l)); - // per-row beta - if (epilogue_params.Vbeta.data()) { - converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b)); - } - output = epilogue_fma(converted_beta, converted_src, output); - } - - if constexpr (IsBackpropFusion) { - ElementAux aux_input = ElementAux(0); - if (raw_pointer_cast(epilogue_params.Aux.data())) { - aux_input = epilogue_params.Aux(m + m_b, n + n_b, l); - } - - output = activation(output, aux_source_converter(aux_input)); - local_dBias = add(local_dBias, output); - } - else { - if (raw_pointer_cast(epilogue_params.Aux.data())) { - auto aux_output = output; - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_aux_output = amax_op(local_abs_max_aux_output, aux_output); - aux_output = epilogue_fma(converted_scale_aux, aux_output, ElementCompute(0)); - } - - if constexpr (IsReLUAuxNeeded) { - epilogue_params.Aux(m + m_b, n + n_b, l) = not (aux_output < 0) ? 
uint1b_t(1) : uint1b_t(0); - } else { - epilogue_params.Aux(m + m_b, n + n_b, l) = aux_destination_converter(aux_output); - } - } - - if constexpr (IsClamp) { // Treat Clamp as ReLU - output = activation(output, {0, std::numeric_limits::max()}); - } - else { - output = activation(output); - } - } - - if constexpr (IsScalingAndAmaxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_output = amax_op(local_abs_max_output, output); - output = epilogue_fma(converted_scale_d, output, ElementCompute(0)); - } - - inter_accum[m_b][n_b] = ElementCompute(output); - } - } // n_b - - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n < cute::size<1>(epilogue_params.D.layout())) { - if (raw_pointer_cast(epilogue_params.Bias.data()) && IsBackpropFusion) { - ElementCompute converted_dBias = bias_converter(epilogue_params.Bias(m + m_b)); - local_dBias = add(local_dBias, converted_dBias); - epilogue_params.Bias(m + m_b) = dBias_converter(local_dBias); - } - } - } // m_b - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - epilogue_params.D(m + m_b, n + n_b, l) = destination_converter(inter_accum[m_b][n_b]); - } - } - } - -#if defined(_OPENMP) - #pragma omp critical(Abs_Max_Data_Update) -#endif - { - if constexpr (IsScalingAndAmaxOutputNeeded) { - if (epilogue_params.abs_max_D) { - *epilogue_params.abs_max_D = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_D, abs_max_output_converter(local_abs_max_output)); - } - } - - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - if (epilogue_params.abs_max_Aux) { - *epilogue_params.abs_max_Aux = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_Aux, abs_max_output_converter(local_abs_max_aux_output)); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GEMM - General 
Matrix-Matrix contraction without conjugation options -template < - class MainloopParams, - class EpilogueParams -> -void Gemm3x( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - using namespace cute; - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename MainloopParams::LayoutB{})); - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == cute::rank(typename EpilogueParams::LayoutD{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename EpilogueParams::LayoutC{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "Only Rank3 Tensors (M, K, Batch_Count) " - "with Batchmode are supported"); - // Lower the Matrix-Multiplication with Groupwise scaling (Gemm3x) to a Tensor Contraction (Gett). - Gett(mainloop_params, epilogue_params); -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // cutlass::reference::host - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu index d20bad58..d14360de 100644 --- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu +++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu @@ -87,11 +87,11 @@ #include "cutlass/util/reference/host/tensor_compare.h" #include "cutlass/util/reference/host/tensor_norm.h" #include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/gett.hpp" // 
Includes from examples directory #include "helper.h" #include "hopper_fp8_commandline.hpp" -#include "reference/host/gemm_with_groupwise_scaling.h" using namespace cute; @@ -128,54 +128,29 @@ using ElementAccumulator = float; // E using ElementBlockScale = float; // Element type for blockscaling during accumulation using ElementCompute = float; // Element type for epilogue computation -using TileShape_ = Shape<_128,_128,_128>; // This one is just to make the compiler happy with verify()... +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster -// ScaleGranularity{M,N}: number of {rows in A}/{columns in B} that share the same scaling factor -// Given TileShape = Shape<_128,_128,_128>: -// ScaleGranularityM == 128 and ScaleGranularityN == 128 --> 2Dx2D (the shape of the scaling factor) -// ScaleGranularityM == 1 and ScaleGranularityN == 128 --> 1Dx2D scaling -// ScaleGranularityM == 128 and ScaleGranularityN == 1 --> 2Dx1D scaling -// ScaleGranularityM == 1 and ScaleGranularityN == 1 --> 1Dx1D scaling -template -struct GroupScaleConfig { - using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature - using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag - using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size - using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster +constexpr int ScaleGranularityM = 1; +constexpr int ScaleGranularityN = 128; +constexpr int ScaleGranularityK = 128; - static constexpr int ScaleGranularityM = ScaleGranularityM_; - static constexpr int ScaleGranularityN = ScaleGranularityN_; - static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / 
ScaleGranularityM; - static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; +constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; +constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; - static_assert(size<0>(TileShape{}) == ScaleGranularityM * ScaleMsPerTile, - "FP8 scaling granularity must evenly divide tile shape along M."); - static_assert(size<1>(TileShape{}) == ScaleGranularityN * ScaleNsPerTile, - "FP8 scaling granularity must evenly divide tile shape along N."); +using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig; - using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8BlockScaledAccum; - using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; - using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; - using FusionOperation = cutlass::epilogue::fusion::LinearCombination; -}; +using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand +using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand -using GroupScale1D1DConfig = GroupScaleConfig< 1, 1>; -using GroupScale1D2DConfig = GroupScaleConfig< 1, size<1>(TileShape_{})>; -using GroupScale2D1DConfig = GroupScaleConfig(TileShape_{}), 1>; -using GroupScale2D2DConfig = GroupScaleConfig(TileShape_{}), size<1>(TileShape_{})>; +using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8BlockScaledAccum; +using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; +using FusionOperation = cutlass::epilogue::fusion::LinearCombination; -template -struct GroupScaleGemm { - using ArchTag = typename ScheduleConfig::ArchTag; - using OperatorClass = typename ScheduleConfig::OperatorClass; - using TileShape = typename ScheduleConfig::TileShape; - using ClusterShape = typename 
ScheduleConfig::ClusterShape; - using KernelSchedule = typename ScheduleConfig::KernelSchedule; - using EpilogueSchedule = typename ScheduleConfig::EpilogueSchedule; - using EpilogueTileType = typename ScheduleConfig::EpilogueTileType; - using FusionOperation = typename ScheduleConfig::FusionOperation; - - using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, @@ -186,10 +161,10 @@ struct GroupScaleGemm { FusionOperation >::CollectiveOp; - using CollectiveMainloopWithGroupWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< +using CollectiveMainloopWithGroupWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< ArchTag, OperatorClass, - ElementA, LayoutA *, AlignmentA, - ElementB, LayoutB *, AlignmentB, + ElementA, cute::tuple, AlignmentA, + ElementB, cute::tuple, AlignmentB, ElementAccumulator, TileShape, ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout< @@ -198,29 +173,23 @@ struct GroupScaleGemm { KernelSchedule >::CollectiveOp; - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - ProblemShape, - CollectiveMainloopWithGroupWiseScaling, - CollectiveEpilogue +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloopWithGroupWiseScaling, + CollectiveEpilogue >; - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; -}; +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; -using GroupScale1D1DGemm = GroupScaleGemm; -using GroupScale1D2DGemm = GroupScaleGemm; -using GroupScale2D1DGemm = GroupScaleGemm; -using GroupScale2D2DGemm = GroupScaleGemm; // Extract information from Gemm kernel. 
-using EpilogueOutputOp = typename GroupScale1D1DGemm::Gemm::EpilogueOutputOp; +using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; using ElementScalar = typename EpilogueOutputOp::ElementScalar; -using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; -using StrideA = typename GroupScale1D1DGemm::Gemm::GemmKernel::InternalStrideA; -using StrideB = typename GroupScale1D1DGemm::Gemm::GemmKernel::InternalStrideB; -using StrideC = typename GroupScale1D1DGemm::Gemm::GemmKernel::InternalStrideC; -using StrideD = typename GroupScale1D1DGemm::Gemm::GemmKernel::InternalStrideD; +using StrideA = typename Gemm::GemmKernel::InternalStrideA; +using StrideB = typename Gemm::GemmKernel::InternalStrideB; +using StrideC = typename Gemm::GemmKernel::InternalStrideC; +using StrideD = typename Gemm::GemmKernel::InternalStrideD; static_assert(cute::is_same_v, "ElementAccumulator and ElementBlockScale should be same datatype"); @@ -240,6 +209,8 @@ std::vector stride_A_host; std::vector stride_B_host; std::vector stride_C_host; std::vector stride_D_host; +std::vector layout_SFA_host; +std::vector layout_SFB_host; std::vector alpha_host; std::vector beta_host; @@ -265,6 +236,8 @@ cutlass::DeviceAllocation stride_A; cutlass::DeviceAllocation stride_B; cutlass::DeviceAllocation stride_C; cutlass::DeviceAllocation stride_D; +cutlass::DeviceAllocation layout_SFA; +cutlass::DeviceAllocation layout_SFB; cutlass::DeviceAllocation alpha_device; cutlass::DeviceAllocation beta_device; @@ -343,10 +316,6 @@ bool initialize_block( template void allocate(const OptionType &options) { - using TileShape = typename OptionType::GroupScaleConfig::TileShape; - const int ScaleMsPerTile = OptionType::GroupScaleConfig::ScaleMsPerTile; - const int ScaleNsPerTile = OptionType::GroupScaleConfig::ScaleNsPerTile; - int64_t total_elements_A = 0; int64_t total_elements_B = 0; int64_t total_elements_C = 0; @@ -372,10 +341,8 @@ void allocate(const OptionType &options) { auto N = get<1>(problem); auto K 
= get<2>(problem); - auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(problem), TileShape{}))); - auto groupscale_m = cute::get<0>(blockscale_shape) * ScaleMsPerTile; // We need to pad along M in scale tensor of A to prevent illegal memory access. - auto groupscale_n = cute::get<1>(blockscale_shape) * ScaleNsPerTile; // We need to pad along N in scale tensor of A to prevent illegal memory access. - auto blockscale_k = cute::get<2>(blockscale_shape); + auto group_layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1)); + auto group_layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1)); offset_A.push_back(total_elements_A); offset_B.push_back(total_elements_B); @@ -388,8 +355,8 @@ void allocate(const OptionType &options) { int64_t elements_B = K * N; int64_t elements_C = M * N; int64_t elements_D = M * N; - int64_t elements_blockscale_A = groupscale_m * blockscale_k; - int64_t elements_blockscale_B = groupscale_n * blockscale_k; + int64_t elements_blockscale_A = size(filter_zeros(group_layout_SFA)); + int64_t elements_blockscale_B = size(filter_zeros(group_layout_SFB)); total_elements_A += elements_A; total_elements_B += elements_B; @@ -402,6 +369,8 @@ void allocate(const OptionType &options) { stride_B_host.push_back(cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1})); stride_C_host.push_back(cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1})); stride_D_host.push_back(cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1})); + layout_SFA_host.push_back(group_layout_SFA); + layout_SFB_host.push_back(group_layout_SFB); } @@ -477,6 +446,12 @@ void initialize(const OptionType &options) { stride_D.reset(options.groups); stride_D.copy_from_host(stride_D_host.data()); + layout_SFA.reset(options.groups); + layout_SFA.copy_from_host(layout_SFA_host.data()); + + layout_SFB.reset(options.groups); + layout_SFB.copy_from_host(layout_SFB_host.data()); + alpha_device.reset(options.groups); 
alpha_device.copy_from_host(ptr_alpha_host.data()); beta_device.reset(options.groups); @@ -500,14 +475,14 @@ GemmArguments args_from_options(const OptionType &options, bool host_problem_sha // Change device_id to another value if you are running on a machine with multiple GPUs and wish // to use a GPU other than that with device ID 0. int device_id = 0; - cutlass::KernelHardwareInfo kernel_hw_info = cutlass::KernelHardwareInfo::make_kernel_hardware_info(device_id); + cutlass::KernelHardwareInfo kernel_hw_info = cutlass::KernelHardwareInfo::make_kernel_hardware_info(device_id); GemmArguments arguments{ cutlass::gemm::GemmUniversalMode::kGrouped, {options.groups, problem_sizes.get(), host_problem_shapes_available ? options.problem_sizes_host.data() : (decltype(options.problem_sizes_host.data())) nullptr}, {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get(), - ptr_blockscale_A.get(), - ptr_blockscale_B.get() + ptr_blockscale_A.get(), layout_SFA.get(), + ptr_blockscale_B.get(), layout_SFB.get() }, { {}, // epilogue.thread @@ -577,12 +552,6 @@ bool verify(const OptionType &options) { // Group scaling tensors shapes based `ScaleGranularityM`, CTA Block (TileShape) and GEMM Problem shape auto [m, n, k] = options.problem_sizes_host.at(group_idx); auto gemm_problem_shape = cute::make_shape(m, n, k); - auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape_{}))); - auto blockscale_m = cute::get<0>(blockscale_shape); - auto blockscale_n = cute::get<1>(blockscale_shape); - auto blockscale_k = cute::get<2>(blockscale_shape); - auto groupscale_m = blockscale_m * OptionType::GroupScaleConfig::ScaleMsPerTile; - auto groupscale_n = blockscale_n * OptionType::GroupScaleConfig::ScaleNsPerTile; // Create instantiation for device reference gemm kernel auto A = cute::make_tensor(block_A_host.data() + offset_A.at(group_idx), @@ -610,32 +579,20 @@ bool verify(const OptionType &options) { ) ); - auto blockscale_A = 
cute::make_tensor(blockscale_block_A_host.data() + offset_blockscale_A.at(group_idx), - cute::make_layout( - cute::make_shape(groupscale_m, blockscale_k, 1), - cute::make_stride(1, groupscale_m, groupscale_m * blockscale_k) - ) - ); - auto blockscale_B = cute::make_tensor(blockscale_block_B_host.data() + offset_blockscale_B.at(group_idx), - cute::make_layout( - cute::make_shape(groupscale_n, blockscale_k, 1), - cute::make_stride(1, groupscale_n, groupscale_n * blockscale_k) - ) - ); + auto SFA = cute::make_tensor(blockscale_block_A_host.data() + offset_blockscale_A.at(group_idx), + layout_SFA_host.at(group_idx)); + auto SFB = cute::make_tensor(blockscale_block_B_host.data() + offset_blockscale_B.at(group_idx), + layout_SFB_host.at(group_idx)); using unused_t = decltype(D); - cutlass::reference::host::GettMainloopParams< + cutlass::reference::host::GettBlockScalingMainloopParams< ElementAccumulator, - decltype(A), + decltype(A), + decltype(SFA), decltype(B), - decltype(blockscale_A), - decltype(blockscale_B), - TileShape_ - > mainloop_params{ - A, B, // Operand Tensors - blockscale_A, blockscale_B // Groupwise scaling Tensors - }; + decltype(SFB) + > mainloop_params{A, SFA, B, SFB}; cutlass::reference::host::GettEpilogueParams< ElementScalar, @@ -647,8 +604,7 @@ bool verify(const OptionType &options) { unused_t, // bias unused_t, // Aux unused_t, // valpha - unused_t, // vbeta - ActivationFunctor + unused_t // vbeta > epilogue_params; epilogue_params.C = C; @@ -679,15 +635,9 @@ bool verify(const OptionType &options) { } /// Execute a given example GEMM computation -template +template int run(OptionType &options, bool host_problem_shapes_available = true) { - using TileShape = typename OptionType::GroupScaleConfig::TileShape; - const int ScaleGranularityM = OptionType::GroupScaleConfig::ScaleGranularityM; - const int ScaleGranularityN = OptionType::GroupScaleConfig::ScaleGranularityN; - const int ScaleMsPerTile = OptionType::GroupScaleConfig::ScaleMsPerTile; - const 
int ScaleNsPerTile = OptionType::GroupScaleConfig::ScaleNsPerTile; - allocate(options); initialize(options); @@ -797,18 +747,12 @@ int main(int argc, char const **args) { // Parse options // - Options options_1d1d; - Options options_1d2d; - Options options_2d1d; - Options options_2d2d; + Options options; - options_1d1d.parse(argc, args); - options_1d2d.parse(argc, args); - options_2d1d.parse(argc, args); - options_2d2d.parse(argc, args); + options.parse(argc, args); - if (options_1d1d.help) { - options_1d1d.print_usage(std::cout) << std::endl; + if (options.help) { + options.print_usage(std::cout) << std::endl; return 0; } @@ -816,22 +760,10 @@ int main(int argc, char const **args) { // Evaluate CUTLASS kernels // - auto run_tests = [&] (bool host_problem_shapes_available = true) { - std::cout << "Grouped GEMM kernel with 1D1D group scale" << std::endl; - run(options_1d1d, host_problem_shapes_available); - std::cout << "Grouped GEMM kernel with 1D2D group scale" << std::endl; - run(options_1d2d, host_problem_shapes_available); - std::cout << "Grouped GEMM kernel with 2D1D group scale" << std::endl; - run(options_2d1d, host_problem_shapes_available); - std::cout << "Grouped GEMM kernel with 2D2D group scale" << std::endl; - run(options_2d2d, host_problem_shapes_available); - std::cout << std::endl; - }; - std::cout << "Running tests with host problem shapes:" << std::endl; - run_tests(true); + run(options, true); std::cout << "Running tests without host problem shapes:" << std::endl; - run_tests(false); + run(options, false); #endif diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu new file mode 100644 index 00000000..2ea42bbf --- /dev/null +++ 
b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu @@ -0,0 +1,781 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/*! \file + \brief Grouped scale Hopper FP8 Grouped GEMM example using CUTLASS 3.0 APIs for NVIDIA Hopper architecture + This example demonstrates a grouped scaled FP8 Grouped GEMM using the new CUTLASS 3.0 + APIs on NVIDIA Hopper architecture. New features that will be showcased in this example are as follows: + 1. NVIDIA Hopper architecture introduces a new series of tensor core instructions (GMMA) + which are more efficient than the Ampere tensor core instructions. + 2. NVIDIA Hopper architecture includes new Tensor Memory Accelerator (TMA) unit to transfer large + blocks of data efficiently between global memory and shared memory. TMA also supports asynchronous + copies between thread blocks in a cluster. This example also showcases on-the-fly modification of TMA + descriptors to move between groups/problem_count (represented by groups). + 3. This example uses the Warp Specialized kernel design (see /media/docs/efficient_gemm.md for details). + 4. A simple way to tune the CTA rasterization direction and swizzle pattern of Hopper kernels. Both the + CTA rasterization direction and swizzle pattern impact cross-CTA locality of accesses. By tuning we can + improve performance. + 5. This example is tuned specifically for the sparse groups case, where the number of active groups (groups + with non-zero problem count) is much smaller than the total number of groups.
+ Examples: + $ ./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups \ + --m=2816 --n=3072 --k=16384 --save_aux=false --save_amax=false \ + --raster=h --swizzle=2 --benchmark=./test_benchmark.txt + + Where the test_benchmark.txt may look as such: + 0 256x512x128 + 1 256x512x512 + 2 512x256x128 + 3 256x256x128 + 4 256x512x1024 + 5 1024x512x128 and so on +*/ + +#include +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/gett.hpp" + +// Includes from examples directory +#include "helper.h" +#include "hopper_fp8_commandline.hpp" + +using namespace cute; + +using ProblemShape = cutlass::gemm::GroupProblemShape>; // per group + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel 
configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C matrix configuration +using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = AlignmentC; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementBlockScale = float; // Element type for blockscaling during accumulation +using ElementCompute = float; // Element type for epilogue computation + +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + +using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster + +static constexpr int ScaleGranularityM = 1; +static constexpr int ScaleGranularityN = 128; +static constexpr int ScaleGranularityK = 128; +static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; +static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; + +using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig; + +using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand +using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand + + +using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum; +using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; +using FusionOperation = cutlass::epilogue::fusion::LinearCombination; + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementCompute, + ElementC, LayoutC *, AlignmentC, + ElementD, LayoutD *, AlignmentD, + EpilogueSchedule, + FusionOperation +>::CollectiveOp; + +using CollectiveMainloopWithGroupWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, cute::tuple, AlignmentA, + ElementB, cute::tuple, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule +>::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloopWithGroupWiseScaling, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Extract information from Gemm kernel. 
+using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; +using ElementScalar = typename EpilogueOutputOp::ElementScalar; +using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; + +using StrideA = typename Gemm::GemmKernel::InternalStrideA; +using StrideB = typename Gemm::GemmKernel::InternalStrideB; +using StrideC = typename Gemm::GemmKernel::InternalStrideC; +using StrideD = typename Gemm::GemmKernel::InternalStrideD; + +static_assert(cute::is_same_v, + "ElementAccumulator and ElementBlockScale should be same datatype"); + +/// Initialization + +cutlass::DeviceAllocation problem_sizes; + +std::vector offset_A; +std::vector offset_B; +std::vector offset_C; +std::vector offset_D; +std::vector offset_blockscale_A; +std::vector offset_blockscale_B; + +std::vector stride_A_host; +std::vector stride_B_host; +std::vector stride_C_host; +std::vector stride_D_host; +std::vector layout_SFA_host; +std::vector layout_SFB_host; + +std::vector alpha_host; +std::vector beta_host; + +uint64_t seed; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation blockscale_block_A; +cutlass::DeviceAllocation blockscale_block_B; + +cutlass::DeviceAllocation ptr_A; +cutlass::DeviceAllocation ptr_B; +cutlass::DeviceAllocation ptr_C; +cutlass::DeviceAllocation ptr_D; +cutlass::DeviceAllocation ptr_ref_D; +cutlass::DeviceAllocation ptr_blockscale_A; +cutlass::DeviceAllocation ptr_blockscale_B; + +cutlass::DeviceAllocation stride_A; +cutlass::DeviceAllocation stride_B; +cutlass::DeviceAllocation stride_C; +cutlass::DeviceAllocation stride_D; +cutlass::DeviceAllocation layout_SFA; +cutlass::DeviceAllocation layout_SFB; + +cutlass::DeviceAllocation alpha_device; +cutlass::DeviceAllocation beta_device; +cutlass::DeviceAllocation block_alpha; +cutlass::DeviceAllocation block_beta; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && 
defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90GroupParams>::RasterOrderOptions; + +/// Result structure +struct Result +{ + double avg_runtime_ms; + double gflops; + double gbps; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + double gbps = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), gbps(gbps), status(status), error(error), passed(false) + {} + +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023, + ScopeMin scope_min = std::nullopt, ScopeMax scope_max = std::nullopt) { + + double _scope_max, _scope_min; + int bits_input = cutlass::sizeof_bits::value; + if (bits_input == 1) { + _scope_max = 2; + _scope_min = 0; + } else if (bits_input <= 8) { + _scope_max = 2; + _scope_min = -2; + } else if (bits_input == 16) { + _scope_max = 5; + _scope_min = -5; + } else { + _scope_max = 8; + _scope_min = -8; + } + if constexpr (!std::is_same_v) { + _scope_max = scope_max; + } + if constexpr (!std::is_same_v) { + _scope_min = scope_min; + } + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, (Element) _scope_max, 
(Element) _scope_min, 0); + + return true; +} + +/// Allocates device-side data +template +void allocate(const OptionType &options) { + + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + int64_t total_elements_blockscale_A = 0; + int64_t total_elements_blockscale_B = 0; + + offset_A.clear(); + offset_B.clear(); + offset_C.clear(); + offset_D.clear(); + offset_blockscale_A.clear(); + offset_blockscale_B.clear(); + stride_A_host.clear(); + stride_B_host.clear(); + stride_C_host.clear(); + stride_D_host.clear(); + + for (int32_t i = 0; i < options.groups; ++i) { + + auto problem = options.problem_sizes_after_alignment_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + + auto group_layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1)); + auto group_layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1)); + + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + offset_blockscale_A.push_back(total_elements_blockscale_A); + offset_blockscale_B.push_back(total_elements_blockscale_B); + + int64_t elements_A = M * K; + int64_t elements_B = K * N; + int64_t elements_C = M * N; + int64_t elements_D = M * N; + int64_t elements_blockscale_A = size(filter_zeros(group_layout_SFA)); + int64_t elements_blockscale_B = size(filter_zeros(group_layout_SFB)); + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + total_elements_blockscale_A += elements_blockscale_A; + total_elements_blockscale_B += elements_blockscale_B; + + stride_A_host.push_back(cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1})); + stride_B_host.push_back(cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1})); + 
stride_C_host.push_back(cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1})); + stride_D_host.push_back(cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1})); + layout_SFA_host.push_back(group_layout_SFA); + layout_SFB_host.push_back(group_layout_SFB); + + } + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_alpha.reset(options.groups); + block_beta.reset(options.groups); + blockscale_block_A.reset(total_elements_blockscale_A); + blockscale_block_B.reset(total_elements_blockscale_B); +} + +/// Initialize operands to be used in the GEMM and reference GEMM +template +void initialize(const OptionType &options) { + + problem_sizes.reset(options.groups); + problem_sizes.copy_from_host(options.problem_sizes_after_alignment_host.data()); + + std::vector ptr_A_host(options.groups); + std::vector ptr_B_host(options.groups); + std::vector ptr_C_host(options.groups); + std::vector ptr_D_host(options.groups); + std::vector ptr_alpha_host(options.groups); + std::vector ptr_beta_host(options.groups); + std::vector ptr_blockscale_A_host(options.groups); + std::vector ptr_blockscale_B_host(options.groups); + + alpha_host.clear(); + beta_host.clear(); + + for (int i = 0; i < options.groups; i++) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + ptr_blockscale_A_host.at(i) = blockscale_block_A.get() + offset_blockscale_A.at(i); + ptr_blockscale_B_host.at(i) = blockscale_block_B.get() + offset_blockscale_B.at(i); + alpha_host.push_back((options.alpha == FLT_MAX) ? static_cast((rand() % 5) + 1) : options.alpha); + beta_host.push_back((options.beta == FLT_MAX) ? 
static_cast(rand() % 5) : options.beta); + ptr_alpha_host.at(i) = block_alpha.get() + i; + ptr_beta_host.at(i) = block_beta.get() + i; + } + + ptr_A.reset(options.groups); + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(options.groups); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(options.groups); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(options.groups); + ptr_D.copy_from_host(ptr_D_host.data()); + + ptr_blockscale_A.reset(options.groups); + ptr_blockscale_A.copy_from_host(ptr_blockscale_A_host.data()); + + ptr_blockscale_B.reset(options.groups); + ptr_blockscale_B.copy_from_host(ptr_blockscale_B_host.data()); + + stride_A.reset(options.groups); + stride_A.copy_from_host(stride_A_host.data()); + + stride_B.reset(options.groups); + stride_B.copy_from_host(stride_B_host.data()); + + stride_C.reset(options.groups); + stride_C.copy_from_host(stride_C_host.data()); + + stride_D.reset(options.groups); + stride_D.copy_from_host(stride_D_host.data()); + + layout_SFA.reset(options.groups); + layout_SFA.copy_from_host(layout_SFA_host.data()); + + layout_SFB.reset(options.groups); + layout_SFB.copy_from_host(layout_SFB_host.data()); + + alpha_device.reset(options.groups); + alpha_device.copy_from_host(ptr_alpha_host.data()); + beta_device.reset(options.groups); + beta_device.copy_from_host(ptr_beta_host.data()); + + initialize_block(block_A, seed + 2022); + initialize_block(block_B, seed + 2023); + initialize_block(block_C, seed + 2024); + initialize_block(blockscale_block_A, seed + 2025, -1, 1); + initialize_block(blockscale_block_B, seed + 2026, -1, 1); + + block_alpha.copy_from_host(alpha_host.data()); + block_beta.copy_from_host(beta_host.data()); + +} + +/// Populates a Gemm::Arguments structure from the given commandline options +template +GemmArguments args_from_options(const OptionType &options, bool host_problem_shapes_available = true) +{ + // Change device_id to another value if you are running on a machine with multiple 
GPUs and wish + // to use a GPU other than that with device ID 0. + int device_id = 0; + cutlass::KernelHardwareInfo kernel_hw_info = cutlass::KernelHardwareInfo::make_kernel_hardware_info(device_id); + + GemmArguments arguments{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), host_problem_shapes_available ? options.problem_sizes_after_alignment_host.data() : (decltype(options.problem_sizes_after_alignment_host.data())) nullptr}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get(), + ptr_blockscale_A.get(), layout_SFA.get(), + ptr_blockscale_B.get(), layout_SFB.get() + }, + { + {}, // epilogue.thread + ptr_C.get(), stride_C.get(), + ptr_D.get(), stride_D.get() + }, + kernel_hw_info + }; + + auto &fusion_args = arguments.epilogue.thread; + if (options.alpha != FLT_MAX && options.beta != FLT_MAX) { + // If both alpha/beta are provided (via cmd line args) and are scalar, i.e., same alpha/beta applies to all batches. + fusion_args.alpha = options.alpha; + fusion_args.beta = options.beta; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = nullptr; + fusion_args.beta_ptr_array = nullptr; + // Single alpha and beta for all groups + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 0}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 0}; + } + else { + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups. 
+ fusion_args.alpha = 0; + fusion_args.beta = 0; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = alpha_device.get(); + fusion_args.beta_ptr_array = beta_device.get(); + // One alpha and beta per each group + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 1}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 1}; + } + + arguments.scheduler.raster_order = options.raster; + // The tile scheduler will swizzle up to 8 and with the nearest multiple of 2 (i.e., 1, 2, 4, and 8) + arguments.scheduler.max_swizzle_size = options.swizzle; + + return arguments; +} + +template +bool verify(const OptionType &options) { + + // + // Compute reference output + // + + std::vector block_A_host(block_A.size()); + std::vector block_B_host(block_B.size()); + std::vector block_C_host(block_C.size()); + std::vector block_D_host_kernel(block_D.size()); + std::vector block_D_host_ref(block_D.size()); + std::vector blockscale_block_A_host(blockscale_block_A.size()); + std::vector blockscale_block_B_host(blockscale_block_B.size()); + + block_A.copy_to_host(block_A_host.data()); + block_B.copy_to_host(block_B_host.data()); + block_C.copy_to_host(block_C_host.data()); + block_D.copy_to_host(block_D_host_kernel.data()); + blockscale_block_A.copy_to_host(blockscale_block_A_host.data()); + blockscale_block_B.copy_to_host(blockscale_block_B_host.data()); + + bool passed = true; + for (int group_idx = 0; group_idx < options.groups; group_idx++) { + // Group scaling tensors shapes based `ScaleGranularityM`, CTA Block (TileShape) and GEMM Problem shape + auto [m, n, k] = options.problem_sizes_after_alignment_host.at(group_idx); + auto gemm_problem_shape = cute::make_shape(m, n, k); + + // Create instantiation for device reference gemm kernel + auto A = cute::make_tensor(block_A_host.data() + offset_A.at(group_idx), + cute::make_layout( + cute::make_shape(m, k, 1), + stride_A_host.at(group_idx) + ) + ); + auto B = cute::make_tensor(block_B_host.data() + 
offset_B.at(group_idx), + cute::make_layout( + cute::make_shape(n, k, 1), + stride_B_host.at(group_idx) + ) + ); + auto C = cute::make_tensor(block_C_host.data() + offset_C.at(group_idx), + cute::make_layout( + cute::make_shape(m, n, 1), + stride_C_host.at(group_idx) + ) + ); + auto D = cute::make_tensor(block_D_host_ref.data() + offset_D.at(group_idx), + cute::make_layout( + cute::make_shape(m, n, 1), + stride_D_host.at(group_idx) + ) + ); + + auto SFA = cute::make_tensor(blockscale_block_A_host.data() + offset_blockscale_A.at(group_idx), + layout_SFA_host.at(group_idx)); + auto SFB = cute::make_tensor(blockscale_block_B_host.data() + offset_blockscale_B.at(group_idx), + layout_SFB_host.at(group_idx)); + + using unused_t = decltype(D); + + cutlass::reference::host::GettBlockScalingMainloopParams< + ElementAccumulator, + decltype(A), + decltype(SFA), + decltype(B), + decltype(SFB) + > mainloop_params{A, SFA, B, SFB}; + + cutlass::reference::host::GettEpilogueParams< + ElementScalar, + ElementScalar, + ElementAccumulator, + ElementCompute, + decltype(C), + decltype(D) + > epilogue_params; + + epilogue_params.C = C; + epilogue_params.D = D; + epilogue_params.alpha = alpha_host.at(group_idx); + epilogue_params.beta = beta_host.at(group_idx); + + // get reference result + cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + auto this_group_passed = std::equal( + // std::execution::par_unseq, + block_D_host_ref.data() + offset_D.at(group_idx), + block_D_host_ref.data() + offset_D.at(group_idx) + m * n, + block_D_host_kernel.data() + offset_D.at(group_idx) + ); + + passed &= this_group_passed; + +#if 0 + std::cout << "Group: " << group_idx << " M: " << m << " N: " << n << " K: " << k << " Status: " << this_group_passed << std::endl; +#endif + + } + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(OptionType &options, bool 
host_problem_shapes_available = true) +{ + + allocate(options); + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options, host_problem_shapes_available); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm.run()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + result.gbps = options.template gbps(result.avg_runtime_ms / 1000.0); + + std::string raster = "Heuristic"; + + if (options.raster == RasterOrderOptions::AlongN) { + raster = "Along N"; + } + else if (options.raster == RasterOrderOptions::AlongM) { + raster = "Along M"; + } + + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + std::cout << " Tile shape (M, N, K): " << size<0>(TileShape{}) << ", " << size<1>(TileShape{}) << ", " << size<2>(TileShape{}) << std::endl; + std::cout << " ScaleGranularityM: " << ScaleGranularityM << " (ScaleMsPerTile: " << ScaleMsPerTile << ")" << std::endl; + std::cout << " ScaleGranularityN: " << ScaleGranularityN << " (ScaleNsPerTile: " << ScaleNsPerTile << ")" << std::endl; + std::cout << " Rasterization: " << raster << " with a maximum CTA swizzle of " << options.swizzle << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. 
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 3)) { + std::cerr << "This example requires CUDA 12.3 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(&current_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major != 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + + run(options, true); + + std::cout << "Running tests without host problem shapes:" << std::endl; + run(options, false); + +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/CMakeLists.txt b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/CMakeLists.txt index f88b3167..09d506de 100644 --- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/CMakeLists.txt +++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/CMakeLists.txt @@ -59,3 +59,26 @@ cutlass_example_add_executable( TEST_SMALL TEST_SMALL_LARGE_GROUP ) + +# MSVC will fail to compile this example with the following error: +# fatal error C1083: Cannot open source file: : No such file or directory 
[...\examples\68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling\68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.vcxproj] +# This is a known issue and we are working on a fix. +if (NOT MSVC) + +cutlass_example_add_executable( + 68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups + 68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu + TEST_COMMAND_OPTIONS + TEST_RANDOM + TEST_RANDOM_LARGE_GROUP + TEST_EPILOGUE + TEST_EPILOGUE_LARGE_GROUP + TEST_EPILOGUE_OP + TEST_EPILOGUE_OP_LARGE_GROUP + TEST_FIXED + TEST_FIXED_LARGE_GROUP + TEST_SMALL + TEST_SMALL_LARGE_GROUP + ) + +endif() diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp index 3e425fe2..19497176 100644 --- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp +++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp @@ -30,12 +30,11 @@ **************************************************************************************************/ // Command line options parsing -template +template struct Options { using RasterOrderOptions = _RasterOrderOptions; using ProblemShape = _ProblemShape; - using GroupScaleConfig = _GroupScaleConfig; bool help = false; @@ -43,6 +42,7 @@ struct Options { int iterations = 1000; int m = 1024, n = 512, k = 1024, groups = 10; std::string benchmark_path; + std::vector problem_sizes_after_alignment_host; std::vector problem_sizes_host; int const tma_alignment_bits = 128; int const alignment = tma_alignment_bits / cutlass::sizeof_bits::value; @@ -89,6 +89,7 @@ struct Options { // Decide how to initialize the problems if (!benchmark_path.empty()) { if (!benchmark_problems()) { + 
problem_sizes_after_alignment_host.clear(); problem_sizes_host.clear(); return; } @@ -105,8 +106,8 @@ struct Options { cmd.get_cmd_line_argument("n", cmd_line_n); cmd.get_cmd_line_argument("k", cmd_line_k); + problem_sizes_after_alignment_host.reserve(groups); problem_sizes_host.reserve(groups); - for (int i = groups; i > 0; i--) { int m = cmd_line_m; int n = cmd_line_n; @@ -120,6 +121,7 @@ struct Options { if (k < 1) { k = k_alignment * ((rand() % (32 * alignment / k_alignment)) + 1); } + problem_sizes_after_alignment_host.push_back({m, n, k}); problem_sizes_host.push_back({m, n, k}); } } @@ -142,7 +144,7 @@ struct Options { break; } - cutlass::gemm::GemmCoord extent; + cutlass::gemm::GemmCoord extent_after_alignment, extent; std::vector tokens; cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); @@ -150,23 +152,81 @@ struct Options { for (int i = 0; i < int(tokens.size()); ++i) { int x = std::atoi(tokens.at(i).c_str()); + extent.at(i) = x; // round up if (x % alignment) { x += (alignment - (x % alignment)); } - extent.at(i) = x; + extent_after_alignment.at(i) = x; } - if (extent.product()) { - problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); - } + problem_sizes_after_alignment_host.push_back({extent_after_alignment.m(), extent_after_alignment.n(), extent_after_alignment.k()}); + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } - groups = static_cast(problem_sizes_host.size()); + groups = static_cast(problem_sizes_after_alignment_host.size()); return true; } + /// Calculate memory bandwidth statistics + template + auto gbps(double runtime_s) const { + double total_read_bytes = 0; + double total_write_bytes = 0; + + // Calculate bytes read and written for each problem + for (int i = 0; i < groups; ++i) { + auto problem = problem_sizes_host.at(i); + auto M = cute::get<0>(problem); + auto N = cute::get<1>(problem); + auto K = cute::get<2>(problem); + + if (M > 0) { // Only count active problems + // Matrix A: M*K elements 
read + total_read_bytes += M * K * sizeof(ElementA); + + // Matrix B: K*N elements read + total_read_bytes += K * N * sizeof(ElementB); + + // Matrix C: M*N elements read (for beta operation) + total_read_bytes += M * N * sizeof(ElementC); + + // Block scales for A and B + auto blockscale_shape = cute::shape(cute::get<1>(cute::zipped_divide(cute::make_layout(problem), TileShape{}))); + auto blockscale_m = cute::get<0>(blockscale_shape); + auto blockscale_n = cute::get<1>(blockscale_shape); + auto blockscale_k = cute::get<2>(blockscale_shape); + auto groupscale_m = blockscale_m * ScaleMsPerTile; + auto groupscale_n = blockscale_n * ScaleNsPerTile; + + total_read_bytes += groupscale_m * blockscale_k * sizeof(ElementBlockScale); // A scales + total_read_bytes += groupscale_n * blockscale_k * sizeof(ElementBlockScale); // B scales + + // Matrix D: M*N elements written + total_write_bytes += M * N * sizeof(ElementD); + } + } + + return (total_read_bytes + total_write_bytes) / 1.0e9 / runtime_s; + } + + double bandwidth_util(double eff_bandwidth) const { + int memoryClockRate; + int memoryBusWidth; + cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, 0); + cudaDeviceGetAttribute(&memoryBusWidth, cudaDevAttrGlobalMemoryBusWidth , 0); + double bw = 2.0 * memoryClockRate * (memoryBusWidth / 8) / 1.0e6; + return eff_bandwidth / bw * 100.0; + } + /// Prints the usage statement. 
std::ostream & print_usage(std::ostream &out) const { diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h deleted file mode 100644 index 1a94af67..00000000 --- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h +++ /dev/null @@ -1,520 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GETT in host-side code. -*/ - -#pragma once - -///////////////////////////////////////////////////////////////////////////////////////////////// -#include "cutlass/gemm/gemm.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/epilogue/thread/activation.h" -#include "cutlass/relatively_equal.h" -#include -#include "cute/tensor.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::reference::host { - -template -struct ElementTraits { - using type = T; -}; - -template -struct ElementTraits().get()), void> > > { - using type = decltype(std::declval().get()); -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template< - class ElementAccumulator_, - class TensorA_, // (M, K, L) - class TensorB_, // (N, K, L) - class TensorScaleA_, // (m, k, L) - class TensorScaleB_, // (n, k, L) - class TileShape_ -> -struct GettMainloopParams { - using ElementAccumulator = ElementAccumulator_; - using TensorA = TensorA_; - using TensorB = TensorB_; - using EngineA = typename TensorA::engine_type; - using LayoutA = typename TensorA::layout_type; - using EngineB = typename TensorB::engine_type; - using LayoutB = typename 
TensorB::layout_type; - - using TensorScaleA = TensorScaleA_; - using TensorScaleB = TensorScaleB_; - using TileShape = TileShape_; - using EngineScaleA = typename TensorScaleA::engine_type; - using EngineScaleB = typename TensorScaleB::engine_type; - - TensorA A{}; - TensorB B{}; - TensorScaleA ScaleA{}; - TensorScaleB ScaleB{}; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// -template< - class ElementScalar_, - class ElementScalingFactor_, - class ElementAccumulator_, - class ElementCompute_, - class TensorC_, // (M, N, L) - class TensorD_, // (M, N, L) - class VectorBias_ = TensorD_, // (M, 1) - class TensorAux_ = TensorD_, // (M, N, L) - class VectorAlpha_ = TensorD_, // (M, 1) - class VectorBeta_ = VectorAlpha_, // (M, 1) - class ActivationFunctor_ = cutlass::epilogue::thread::Identity, - class BiasBinaryOp_ = cutlass::plus, - bool PerColumnBias_ = false -> -struct GettEpilogueParams { - using ElementScalar = ElementScalar_; - using ElementScalingFactor = ElementScalingFactor_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using TensorC = TensorC_; - using TensorD = TensorD_; - using TensorAux = TensorAux_; - using VectorBias = VectorBias_; - using VectorAlpha = VectorAlpha_; - using VectorBeta = VectorBeta_; - using ActivationFunctor = ActivationFunctor_; - using BiasBinaryOp = BiasBinaryOp_; - - using EngineC = typename TensorC::engine_type; - using LayoutC = typename TensorC::layout_type; - using EngineD = typename TensorD::engine_type; - using LayoutD = typename TensorD::layout_type; - static constexpr bool PerColumnBias = PerColumnBias_; - ElementScalar alpha = ElementScalar(1); - ElementScalar beta = ElementScalar(0); - - TensorC C{}; - TensorD D{}; - VectorBias Bias{}; - TensorAux Aux{}; - VectorAlpha Valpha{}; - VectorBeta Vbeta{}; - ElementCompute st = ElementCompute(1); - - ElementAccumulator* abs_max_D = nullptr; - ElementAccumulator* abs_max_Aux 
= nullptr; - - ElementScalingFactor scale_a = ElementScalingFactor(1); - ElementScalingFactor scale_b = ElementScalingFactor(1); - ElementScalingFactor scale_c = ElementScalingFactor(1); - ElementScalingFactor scale_d = ElementScalingFactor(1); - ElementScalingFactor scale_aux = ElementScalingFactor(1); - - bool beta_per_channel_scaling = false; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - General Tensor-Tensor contraction reference kernel with Groupwise scaling -template < - class MainloopParams, - class EpilogueParams -> -void Gett( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - - static int constexpr kBlockM = cute::get<0>(typename MainloopParams::TileShape{}); - static int constexpr kBlockN = cute::get<1>(typename MainloopParams::TileShape{}); - // printf("mainloop_params.ScaleA.layout()"); cute::print(mainloop_params.ScaleA.layout()); printf("\n"); - // printf("mainloop_params.ScaleB.layout()"); cute::print(mainloop_params.ScaleB.layout()); printf("\n"); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int64_t l = 0; l < cute::size<2>(mainloop_params.A.layout()); ++l) { - for (int64_t m = 0; m < cute::size<0>(mainloop_params.A.layout()); m += kBlockM) { - for (int64_t n = 0; n < cute::size<0>(mainloop_params.B.layout()); n += kBlockN) { - typename MainloopParams::ElementAccumulator acc[kBlockM][kBlockN]; - gett_mainloop(mainloop_params, m, n, l, acc); - gett_epilogue(epilogue_params, m, n, l, acc); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Mainloop -template -void gett_mainloop( - MainloopParams const& mainloop_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B"); - static_assert(cute::rank(typename 
MainloopParams::LayoutB{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementA = typename ElementTraits::type; - using ElementB = typename ElementTraits::type; - using ElementBlockScaleA = typename ElementTraits::type; - using ElementBlockScaleB = typename ElementTraits::type; - - using RingOp = multiply_add; - RingOp fma_op; - - multiplies scale_op; - - static int constexpr kBlockK = cute::get<2>(typename MainloopParams::TileShape{});; - - // Tempo accumulators to seperate blockwise accumulation - typename MainloopParams::ElementAccumulator acc_temp[kBlockM][kBlockN]; - - // Zero out accumulators - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc[m_b][n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - acc_temp[m_b][n_b] = ElementAccumulator(0); - } - } - - const int M = cute::size<0>(mainloop_params.A.layout()); - const int N = cute::size<0>(mainloop_params.B.layout()); - - const int ScaleGranularityM = M / cute::size<0>(mainloop_params.ScaleA.layout()); - const int ScaleGranularityN = N / cute::size<0>(mainloop_params.ScaleB.layout()); - - assert(ScaleGranularityM && M % ScaleGranularityM == 0 && "ScaleGranularityM must divide M"); - assert(ScaleGranularityN && N % ScaleGranularityN == 0 && "ScaleGranularityN must divide N"); - - cute::Tensor blockscale_A = domain_offset(make_coord(m / ScaleGranularityM, _0{}), mainloop_params.ScaleA(_, _, l)); - cute::Tensor blockscale_B = domain_offset(make_coord(n / ScaleGranularityN, _0{}), mainloop_params.ScaleB(_, _, l)); - - // Compute on this k-block - for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) { - - // Load Blockwise scaling factor from blockscale Tensors for B - int64_t block_k = k / kBlockK; - cute::Tensor scale_a = blockscale_A(_, block_k); - cute::Tensor scale_b = blockscale_B(_, block_k); - - // Load A - ElementAccumulator a_frag[kBlockM]; - for (int m_b = 0; m_b < kBlockM; ++m_b) { - if (m + m_b < 
cute::size<0>(mainloop_params.A.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. - a_frag[m_b] = static_cast(ElementA(mainloop_params.A(m + m_b, k, l))); - } else { - a_frag[m_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // Load B - ElementAccumulator b_frag[kBlockN]; - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (n + n_b < cute::size<0>(mainloop_params.B.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. - b_frag[n_b] = static_cast(ElementB(mainloop_params.B(n + n_b, k, l))); - } else { - b_frag[n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // do compute - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc_temp[m_b][n_b] = fma_op(a_frag[m_b], b_frag[n_b], acc_temp[m_b][n_b]); - } - } - - // Apply Groupwise-scaling at kBlockK boundary - // (a) Apply group and block scaling factors on the partial accumulated results (acc_temp) at the kBlocK boundary - // (b) Zero-out partial temporary (acc_temp), - // (c) Update permanent (accu) - if ((k+1) % kBlockK == 0) { - for (int m_b = 0; m_b < kBlockM; ++m_b) { - auto scale_a_m_b = scale_a[m_b / ScaleGranularityM]; - for (int n_b = 0; n_b < kBlockN; ++n_b) { - auto scale_b_n_b = scale_b[n_b / ScaleGranularityN]; - ElementAccumulator blockwise_scaled_accum = acc_temp[m_b][n_b] * scale_a_m_b * scale_b_n_b; - acc[m_b][n_b] = blockwise_scaled_accum + acc[m_b][n_b]; - acc_temp[m_b][n_b] = ElementAccumulator(0); - } - } - } - - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Epilogue -template -void gett_epilogue( - EpilogueParams const& epilogue_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == 3, "M, K, B"); - 
static_assert(cute::rank(typename EpilogueParams::LayoutD{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementCompute = typename EpilogueParams::ElementCompute; - using ElementC = typename EpilogueParams::TensorC::value_type; - using ElementD = typename EpilogueParams::TensorD::value_type; - using ElementAux = typename EpilogueParams::TensorAux::value_type; - using ElementBias = typename EpilogueParams::VectorBias::value_type; - using ElementScalar = typename EpilogueParams::ElementScalar; - using ElementScalingFactor = typename EpilogueParams::ElementScalingFactor; - using ActivationFunctor = typename EpilogueParams::ActivationFunctor; - using BiasBinaryOp = typename EpilogueParams::BiasBinaryOp; - - constexpr bool PerColBias = EpilogueParams::PerColumnBias; - constexpr bool IsScalingAndAmaxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsScalingAndAmaxAuxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsReLUAuxNeeded = - (cute::is_same_v> or - cute::is_same_v>) and - cute::is_same_v; - constexpr bool IsClamp = - cute::is_same_v>; - - constexpr bool IsBackpropFusion = - cute::is_same_v> or - cute::is_same_v>; - - // Input related converter - NumericConverter accumulator_converter; - NumericConverter source_converter; - NumericConverter bias_converter; - [[maybe_unused]] NumericConverter aux_source_converter; - - // Scale related converter - NumericConverter scale_converter; - NumericConverter scaling_factor_converter; - - // Abs max converter - [[maybe_unused]] NumericConverter abs_max_output_converter; - - // Output related converter - NumericConverter destination_converter; - [[maybe_unused]] NumericConverter aux_destination_converter; - NumericConverter dBias_converter; - - // Epilogue operations - multiply_add epilogue_fma; - multiplies mul; - plus add; - - // Activation operation - - auto activation = [] (ElementCompute x, ElementCompute y = ElementCompute(0)) { - if constexpr 
(std::is_same_v) { - return x + y; - } else { - return ActivationFunctor()(x, y); - } - }; - - // Bias binary operation - BiasBinaryOp bias_op; - - // Do conversion - ElementCompute converted_alpha = scale_converter(epilogue_params.alpha); - ElementCompute converted_beta = scale_converter(epilogue_params.beta); - ElementCompute converted_scale_a = scaling_factor_converter(epilogue_params.scale_a); - ElementCompute converted_scale_b = scaling_factor_converter(epilogue_params.scale_b); - ElementCompute converted_scale_c = scaling_factor_converter(epilogue_params.scale_c); - ElementCompute converted_scale_d = scaling_factor_converter(epilogue_params.scale_d); - ElementCompute converted_scale_aux = scaling_factor_converter(epilogue_params.scale_aux); - - // Init local var - [[maybe_unused]] ElementCompute local_abs_max_output = ElementCompute(0); - [[maybe_unused]] ElementCompute local_abs_max_aux_output = ElementCompute(0); - - converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); - converted_beta = mul(converted_beta, converted_scale_c); - - ElementCompute inter_accum[kBlockM][kBlockN]; - - for (int m_b = 0; m_b < kBlockM; ++m_b) { - ElementCompute local_dBias = ElementCompute(0); - - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - // Convert every type to ElementCompute first, do compute, convert to output type, write it out - ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]); - // per-row alpha - if (raw_pointer_cast(epilogue_params.Valpha.data())) { - converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b)); - } - ElementCompute output = mul(converted_alpha, converted_acc); - - if (raw_pointer_cast(epilogue_params.Bias.data()) && not IsBackpropFusion) { - ElementCompute converted_bias = bias_converter(epilogue_params.Bias(PerColBias ? 
n + n_b : m + m_b)); - output = bias_op(output, converted_bias); - } - - if (raw_pointer_cast(epilogue_params.C.data())) { - ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l)); - // per-row beta - if (epilogue_params.Vbeta.data()) { - converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b)); - } - output = epilogue_fma(converted_beta, converted_src, output); - } - - if constexpr (IsBackpropFusion) { - ElementAux aux_input = ElementAux(0); - if (raw_pointer_cast(epilogue_params.Aux.data())) { - aux_input = epilogue_params.Aux(m + m_b, n + n_b, l); - } - - output = activation(output, aux_source_converter(aux_input)); - local_dBias = add(local_dBias, output); - } - else { - if (raw_pointer_cast(epilogue_params.Aux.data())) { - auto aux_output = output; - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_aux_output = amax_op(local_abs_max_aux_output, aux_output); - aux_output = epilogue_fma(converted_scale_aux, aux_output, ElementCompute(0)); - } - - if constexpr (IsReLUAuxNeeded) { - epilogue_params.Aux(m + m_b, n + n_b, l) = not (aux_output < 0) ? 
uint1b_t(1) : uint1b_t(0); - } else { - epilogue_params.Aux(m + m_b, n + n_b, l) = aux_destination_converter(aux_output); - } - } - - if constexpr (IsClamp) { // Treat Clamp as ReLU - output = activation(output, {0, std::numeric_limits::max()}); - } - else { - output = activation(output); - } - } - - if constexpr (IsScalingAndAmaxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_output = amax_op(local_abs_max_output, output); - output = epilogue_fma(converted_scale_d, output, ElementCompute(0)); - } - - inter_accum[m_b][n_b] = ElementCompute(output); - } - } // n_b - - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n < cute::size<1>(epilogue_params.D.layout())) { - if (raw_pointer_cast(epilogue_params.Bias.data()) && IsBackpropFusion) { - ElementCompute converted_dBias = bias_converter(epilogue_params.Bias(m + m_b)); - local_dBias = add(local_dBias, converted_dBias); - epilogue_params.Bias(m + m_b) = dBias_converter(local_dBias); - } - } - } // m_b - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - epilogue_params.D(m + m_b, n + n_b, l) = destination_converter(inter_accum[m_b][n_b]); - } - } - } - -#if defined(_OPENMP) - #pragma omp critical(Abs_Max_Data_Update) -#endif - { - if constexpr (IsScalingAndAmaxOutputNeeded) { - if (epilogue_params.abs_max_D) { - *epilogue_params.abs_max_D = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_D, abs_max_output_converter(local_abs_max_output)); - } - } - - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - if (epilogue_params.abs_max_Aux) { - *epilogue_params.abs_max_Aux = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_Aux, abs_max_output_converter(local_abs_max_aux_output)); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GEMM - General 
Matrix-Matrix contraction without conjugation options -template < - class MainloopParams, - class EpilogueParams -> -void Gemm3x( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - using namespace cute; - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename MainloopParams::LayoutB{})); - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == cute::rank(typename EpilogueParams::LayoutD{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename EpilogueParams::LayoutC{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "Only Rank3 Tensors (M, K, Batch_Count) " - "with Batchmode are supported"); - // Lower the Matrix-Multiplication with Groupwise scaling (Gemm3x) to a Tensor Contraction (Gett). - Gett(mainloop_params, epilogue_params); -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // cutlass::reference::host - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu index 75d3437d..8be4f639 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu @@ -480,7 +480,12 @@ bool verify(const Options &options) { passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0); passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0); - return passed; + block_SFD.sync_host(); + bool passed_sfd = cutlass::reference::host::TensorEquals(block_reference_SFD.host_view(), block_SFD.host_view()); + passed_sfd &= (cutlass::reference::host::TensorNorm(block_reference_SFD.host_view()) > 0); + passed_sfd &= 
(cutlass::reference::host::TensorNorm(block_SFD.host_view()) > 0); + + return passed && passed_sfd; } /// Execute a given example GEMM computation diff --git a/examples/77_blackwell_fmha/77_blackwell_fmha.cu b/examples/77_blackwell_fmha/77_blackwell_fmha.cu index 1d1314d1..c8792122 100644 --- a/examples/77_blackwell_fmha/77_blackwell_fmha.cu +++ b/examples/77_blackwell_fmha/77_blackwell_fmha.cu @@ -67,9 +67,6 @@ --b=2048 --h=2048 --d=2048 --q=2048 --k=2048 */ -#define DSHOW(x) print(#x ": "); print(x); print("\n"); -#define DSHOWT(x) print(#x ": "); print_tensor(x); print("\n"); - #include #include #include @@ -247,8 +244,8 @@ struct Options { << " and are split B-ways, alternatingly +10% and -10%\n" << " with the last batch sized to make it fit\n" << " implies at least residual masking for correctness\n" - << " --sm-count Sets SM count rather than querying it\n" - << " --kernel-filter= Sets regexp to match kernel against\n" + << " --sm-count Sets SM count rather than querying it\n" + << " --kernel-filter= Sets regexp to match kernel against\n" << "\n"; return out; diff --git a/examples/77_blackwell_fmha/77_blackwell_fmha_bwd.cu b/examples/77_blackwell_fmha/77_blackwell_fmha_bwd.cu new file mode 100644 index 00000000..1c02a29e --- /dev/null +++ b/examples/77_blackwell_fmha/77_blackwell_fmha_bwd.cu @@ -0,0 +1,865 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Example implementation of fused multi-head attention for Blackwell using CUTLASS 3. + + This example showcases the use of CUTLASS to build backward fused + multi-head attantion (FMHA) collectives from existing CUTLASS collectives targeting + the NVIDIA Blackwell architecture. + + Background and motivation + ------------------------- + CUTLASS is a highly flexible library that provides open-source building blocks + for tensor core programming for GEMM or GEMM-like problems. Fused multi-head + attention (FMHA) is a foundational kernel for large language models (LLMs) since it + makes long sequence lengths feasible from a memory-usage perspective. 
It also + improves computational efficiency since it transforms an outer-product-like and + a matrix-vector-like GEMM into a fused operation with much higher arithmetic + intensity. For more details, see Dao et al, 2022; Dao, 2023. + Implementing this kernel in CUTLASS enabled easy customization and high + performance. + + Introduction + ------------ + The example targets the NVIDIA Blackwell architecture, and takes advantage of + 5th gen tensor cores and the Tensor Memory Accelerator (TMA), just like + GEMMs do. It provides a backward pass (often abbreviated + bwd in the code). + The code is structured into three layers: The runner (and the reference kernels) + takes care of initialization, measurement, and testing; the device layer + orchestrates kernel calls and partitions workspace; and the kernel layer (just + like the CUTLASS kernel layer. + + Support + --------- + + We support fp16 and fp8 data types with a head dimension of 128. + + Example usage: + $ ./examples/77_blackwell_fmha/77_blackwell_fmha_bwd_fp16 \ + --b=2048 --h=2048 --d=2048 --q=2048 --k=2048 +*/ + +#include +#include +#include + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "reference/fmha_fwd_reference.hpp" +#include "reference/fmha_bwd_reference.hpp" +#include "reference/reference_abs_error.hpp" + +#include "collective/fmha_fusion.hpp" +#include "device/fmha_device_bwd.hpp" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +using namespace cute; +using namespace cutlass::fmha::kernel; +using namespace cutlass::fmha::collective; +using namespace cutlass::fmha; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +enum class InitStyle { + kOne, kZero, kLinearStride128, kLinearStride1, 
kRandom, kNone +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Command line options parsing +struct Options { + + bool help = false; + bool error = false; + + int b = 16; + int h = 16; + int h_k = 1; + int q = 1024; + int k = 1024; + int d = 128; + int iterations = 3; + bool verify = false; + bool verbose = false; + + bool causal = false; + int sm_count = 0; + + std::string kernel_filter; + + InitStyle init_style_q = InitStyle::kRandom; + InitStyle init_style_k = InitStyle::kRandom; + InitStyle init_style_v = InitStyle::kRandom; + InitStyle init_style_do = InitStyle::kRandom; + bool skip_reference = false; + + static void get_init_style_argument(cutlass::CommandLine& cmd, const char* name, InitStyle& dst, InitStyle const& src) { + std::string s; + cmd.get_cmd_line_argument(name, s, s); + if (s.empty()) { + dst = src; + } + else { + if (s == "r") { + dst = InitStyle::kRandom; + } + else if (s == "0") { + dst = InitStyle::kZero; + } + else if (s == "1") { + dst = InitStyle::kOne; + } + else if (s == "d") { + dst = InitStyle::kLinearStride1; + } + else if (s == "s") { + dst = InitStyle::kLinearStride128; + } + else if (s == "n") { + dst = InitStyle::kNone; + } + else { + std::cout << "Error: " << s << " is not a valid input type.\n"; + std::exit(-1); + } + } + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + Options defaults; + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("d", d, defaults.d); + cmd.get_cmd_line_argument("h", h, -1); + if (h == -1) h = 2048 / d; + + cmd.get_cmd_line_argument("q", q, -1); + cmd.get_cmd_line_argument("k", k, -1); + if (q == -1) q = k; + if (k == -1) k = q; + if (q == -1 && k == -1) q = k = defaults.q; + + cmd.get_cmd_line_argument("b", b, -1); + if (b == -1) b = 16384 / k; + if (b == 0) b = 1; + + cmd.get_cmd_line_argument("iterations", iterations, 
defaults.iterations); + verify = cmd.check_cmd_line_flag("verify"); + verbose = cmd.check_cmd_line_flag("verbose"); + std::string mask; + cmd.get_cmd_line_argument("mask", mask, ""); + if (mask == "causal") { + causal = true; + } + else { + causal = defaults.causal; + } + + skip_reference = cmd.check_cmd_line_flag("skip-reference"); + cmd.get_cmd_line_argument("sm-count", sm_count, defaults.sm_count); + + get_init_style_argument(cmd, "init-style", init_style_q, defaults.init_style_q); + get_init_style_argument(cmd, "init-style", init_style_k, defaults.init_style_k); + get_init_style_argument(cmd, "init-style", init_style_v, defaults.init_style_v); + get_init_style_argument(cmd, "init-style", init_style_do, defaults.init_style_do); + get_init_style_argument(cmd, "init-style-q", init_style_q, init_style_q); + get_init_style_argument(cmd, "init-style-k", init_style_k, init_style_k); + get_init_style_argument(cmd, "init-style-v", init_style_v, init_style_v); + get_init_style_argument(cmd, "init-style-do", init_style_v, init_style_do); + + cmd.get_cmd_line_argument("kernel-filter", kernel_filter, defaults.kernel_filter); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "77_blackwell_fmha_bwd\n\n" + << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n" + << " fused multi-head attention kernels for the backward pass targeting NVIDIA's Blackwell architecture.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --b= Sets the B extent\n" + << " --h= Sets the H extent\n" + << " --q= Sets the Q extent\n" + << " --k= Sets the K extent\n" + << " --d= Sets the D extentn" + << " --iterations= Benchmarking iterations\n" + << " --verify Verify results\n" + << " --verbose Print smem and execution time per kernel\n" + << " --mask= Enables masking\n" + << " --sm-count Sets SM count rather than querying it\n" + << " --kernel-filter= Sets regexp to match kernel against\n" + << "\n"; + + return out; + } +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +void initialize_block( + DeviceAllocation& block, + uint64_t seed=2023, InitStyle init_style = InitStyle::kRandom) { + + switch (init_style) { + case InitStyle::kOne: { + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, (Element) 1, (Element) 1); + break; + } + case InitStyle::kZero: { + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, (Element) 0, (Element) 0); + break; + } + case InitStyle::kRandom: { + cutlass::reference::device::BlockFillRandomGaussian( + block.get(), block.size(), seed, (Element) 0, (Element) 1); + break; + } + case InitStyle::kLinearStride1: { + std::vector data(block.size()); + for (size_t i = 0; i < block.size() / 128; i ++) { + for (int j = 0; j < 128; j++) { + data[j + 128*i] = static_cast((double) (j % 4)); + } + } + block.copy_from_host(data.data(), data.size()); + break; + } + case InitStyle::kLinearStride128: { + 
std::vector data(block.size()); + for (size_t i = 0; i < block.size() / 128; i ++) { + for (int j = 0; j < 128; j++) { + data[j + 128*i] = static_cast((double) (i % 4)); + } + } + block.copy_from_host(data.data(), data.size()); + break; + } + case InitStyle::kNone: { + break; + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +struct ExampleResult { + bool passed = false; + bool verified = false; + float runtime_ms = 0; + double tflops_tc_s = 0; + size_t smem_size = 0; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class TileShape, + class DispatchPolicy, + class ActiveMask, + class... KernelOptions +> +struct BwdRunner { + +#ifdef FP8 + using Element = cutlass::float_e4m3_t; +#else + using Element = cutlass::half_t; +#endif + using ElementAccumulator = float; + + // Q K D (H B) + using ProblemShapeType = cute::tuple>; + + using Operation = cutlass::fmha::device::Sm100FmhaBwd; + + using TensorStride = Stride>; // Seq D (H B) + using StrideQ = TensorStride; + using StrideK = TensorStride; + using StrideV = TensorStride; + using StrideO = TensorStride; + using StrideLSE = Stride<_1, Stride>; // Seq (H B) + + // Backwards specific + using StrideDQ = TensorStride; + using StrideDK = TensorStride; + using StrideDV = TensorStride; + using StrideDO = TensorStride; + + // + // Data members + // + + /// Initialization + StrideQ stride_Q; + StrideK stride_K; + StrideV stride_V; + StrideO stride_O; + StrideLSE stride_LSE; + + StrideDQ stride_dQ; + StrideDK stride_dK; + StrideDV stride_dV; + StrideDO stride_dO; + + uint64_t seed = 0; + + DeviceAllocation block_Q; + DeviceAllocation block_K; + DeviceAllocation block_V; + DeviceAllocation block_O; + DeviceAllocation block_LSE; + + 
DeviceAllocation block_dQ; + DeviceAllocation block_dK; + DeviceAllocation block_dV; + DeviceAllocation block_dO; + + DeviceAllocation block_ref_dQ; + DeviceAllocation block_ref_dK; + DeviceAllocation block_ref_dV; + + // + // Methods + // + bool verify(const ProblemShapeType& problem_shape) { + auto [Q, K, D, HB] = problem_shape; + auto [H, B] = HB; + + Tensor mQ = make_tensor(make_gmem_ptr(block_Q.get()), + select<0,2,3>(problem_shape), + stride_Q); + + Tensor mK = make_tensor(make_gmem_ptr(block_K.get()), + select<1,2,3>(problem_shape), + stride_K); + + Tensor mV = make_tensor(make_gmem_ptr(block_V.get()), + select<1,2,3>(problem_shape), + stride_V); + + Tensor mO = make_tensor(make_gmem_ptr(block_O.get()), + select<0,2,3>(problem_shape), + stride_O); + + // keep going here! (this might be better in cursor) + + Tensor mLSE = make_tensor(make_gmem_ptr(block_LSE.get()), + select<0,3>(problem_shape), + stride_LSE); + + Tensor mDQ = make_tensor(make_gmem_ptr(block_ref_dQ.get()), + select<0,2,3>(problem_shape), + stride_dQ); + + Tensor mDK = make_tensor(make_gmem_ptr(block_ref_dK.get()), + select<1,2,3>(problem_shape), + stride_dK); + + Tensor mDV = make_tensor(make_gmem_ptr(block_ref_dV.get()), + select<1,2,3>(problem_shape), + stride_dV); + + Tensor mDO = make_tensor(make_gmem_ptr(block_dO.get()), + select<0,2,3>(problem_shape), + stride_dO); + + fmha_bwd_reference(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDQ, mDK, mDV, ActiveMask{}); + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Reference kernel failed. Last CUDA error: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + const double kMaxDiffThresh = sizeof(Element) == 1 ? 1e-0 : 1e-2; + const double kMeanDiffThresh = sizeof(Element) == 1 ? 
1e-1 : 1e-3; + + // Check if output from CUTLASS kernel and reference kernel are equal or not + double max_diff = 0; + double mean_diff = 0; + reference_abs_diff(block_dQ, block_ref_dQ, max_diff, mean_diff); + + bool passed_dQ = (max_diff < kMaxDiffThresh) && (mean_diff < kMeanDiffThresh); + if (! passed_dQ) { + std::cerr << "failed dQ: max diff " << max_diff + << " mean " << mean_diff << std::endl; + } + + reference_abs_diff(block_dK, block_ref_dK, max_diff, mean_diff); + + bool passed_dK = (max_diff < kMaxDiffThresh) && (mean_diff < kMeanDiffThresh); + if (! passed_dK) { + std::cerr << "failed dK: max diff " << max_diff + << " mean " << mean_diff << std::endl; + } + + reference_abs_diff(block_dV, block_ref_dV, max_diff, mean_diff); + + bool passed_dV = (max_diff < kMaxDiffThresh) && (mean_diff < kMeanDiffThresh); + if (! passed_dV) { + std::cerr << "failed dV: max diff " << max_diff + << " mean " << mean_diff << std::endl; + } + + return passed_dQ && passed_dK && passed_dV; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_shape, Options const& options) { + auto [Q, K, D, HB] = problem_shape; + auto [H, B] = HB; + D = cutlass::round_up(D, 8); // Alignment + Q = cutlass::round_up(Q, 8); // Alignment + + auto shape_QO = select<0,2,3>(problem_shape); + auto shape_KV = select<1,2,3>(problem_shape); + auto shape_LSE = select<0,3>(problem_shape); + + stride_Q = make_stride(D, _1{}, make_stride(D*Q, D*Q*H)); + stride_K = make_stride(D, _1{}, make_stride(D*K, D*K*H)); + stride_V = stride_K; + stride_O = stride_Q; + stride_LSE = make_stride(_1{}, make_stride(Q, Q*H)); + + stride_dQ = stride_Q; + stride_dK = stride_K; + stride_dV = stride_V; + stride_dO = stride_O; + + auto lsize = [](auto shape) { + return size(make_shape(1ull, shape)); + }; + + block_Q.reset(lsize(shape_QO)); + block_K.reset(lsize(shape_KV)); + block_V.reset(lsize(shape_KV)); + block_O.reset(lsize(shape_QO)); + 
block_LSE.reset(lsize(shape_LSE)); + + block_dQ.reset(lsize(shape_QO)); + block_dK.reset(lsize(shape_KV)); + block_dV.reset(lsize(shape_KV)); + block_dO.reset(lsize(shape_QO)); + + block_ref_dQ.reset(lsize(shape_QO)); + block_ref_dK.reset(lsize(shape_KV)); + block_ref_dV.reset(lsize(shape_KV)); + + initialize_block(block_Q, seed + 2023, options.init_style_q); + initialize_block(block_K, seed + 2022, options.init_style_k); + initialize_block(block_V, seed + 2021, options.init_style_v); + initialize_block(block_dO, seed + 2020, options.init_style_do); + + Tensor mQ = make_tensor(make_gmem_ptr(block_Q.get()), + select<0,2,3>(problem_shape), + stride_Q); + + Tensor mK = make_tensor(make_gmem_ptr(block_K.get()), + select<1,2,3>(problem_shape), + stride_K); + + Tensor mV = make_tensor(make_gmem_ptr(block_V.get()), + select<1,2,3>(problem_shape), + stride_V); + + Tensor mO = make_tensor(make_gmem_ptr(block_O.get()), + select<0,2,3>(problem_shape), + stride_O); + + Tensor mLSE = make_tensor(make_gmem_ptr(block_LSE.get()), + select<0,3>(problem_shape), + stride_LSE); + + if (! 
options.skip_reference) { + fmha_reference(problem_shape, mQ, mK, mV, mO, mLSE, ActiveMask{}); + } + } + + ExampleResult run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + auto problem_shape = make_shape(options.q, options.k, options.d, make_shape(options.h, options.b)); + + initialize(problem_shape, options); + + ElementAccumulator softmax_scale = 1.0f / sqrtf(options.d); + + typename Operation::Arguments arguments{ + problem_shape, + block_Q.get(), stride_Q, + block_K.get(), stride_K, + block_V.get(), stride_V, + block_O.get(), stride_O, + block_LSE.get(), stride_LSE, + block_dO.get(), stride_dO, + block_dQ.get(), stride_dQ, + block_dK.get(), stride_dK, + block_dV.get(), stride_dV, + softmax_scale, + hw_info + }; + + Operation op; + + ExampleResult example_result; + + example_result.smem_size = Operation::Kernel::SharedStorageSize; + + size_t workspace_size = 0; + workspace_size = Operation::get_workspace_size(arguments); + DeviceAllocation workspace(workspace_size); + + cutlass::Status status = cutlass::Status::kSuccess; + status = op.can_implement(arguments); + if (status != cutlass::Status::kSuccess) { + std::cerr << "This kernel is not supported. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return example_result; + } + + status = op.initialize(arguments, workspace.get()); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return example_result; + } + + // Run + status = op.run(); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to launch the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return example_result; + } + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Error running the CUTLASS kernel. 
Last CUDA error is: " + << cudaGetErrorString(result) << std::endl; + return example_result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result = cudaEventCreate(&event); + if (result != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result) << std::endl; + return example_result; + } + } + + // Record an event at the start of a series of GEMMs + result = cudaEventRecord(events[0]); + if (result != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result) << std::endl; + return example_result; + } + + for (int i = 0; i < options.iterations; i++) { + status = op.run(); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to launch the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return example_result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMMs are complete + result = cudaEventRecord(events[1]); + if (result != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result) << std::endl; + return example_result; + } + + // Wait for work on the device to complete. + result = cudaEventSynchronize(events[1]); + if (result != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result) << std::endl; + return example_result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result) << std::endl; + return example_result; + } + + runtime_ms /= static_cast(options.iterations); + + double flops = 10.0 * (std::is_same_v ? 
0.5 : 1.0); + flops *= static_cast(get<0>(problem_shape)); + flops *= static_cast(get<1>(problem_shape)); + flops *= static_cast(get<2>(problem_shape)); + flops *= static_cast(get<3,0>(problem_shape)); + flops *= static_cast(get<3,1>(problem_shape)); + double tflops_s = flops * 1e-12 /*tera*/ / (runtime_ms * 1e-3 /*ms*/); + example_result.tflops_tc_s = tflops_s; + example_result.runtime_ms = runtime_ms; + + result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Error running the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(result) << std::endl; + return example_result; + } + + // Verify that the result is correct + bool passed = true; + if (options.verify) { + passed = verify(problem_shape); + if (passed) example_result.verified = true; + } + + if (!passed) { + std::cerr << "Reference check failed" << std::endl; + return example_result; + } + + example_result.passed = true; + + return example_result; + } + +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to print a description of the example run and its result +void print_result(const std::string& description, ExampleResult result, bool verbose) { + std::ios fmt(nullptr); + fmt.copyfmt(std::cout); + std::cout << (result.passed ? (result.verified ? 
" [OK] " : " [--] ") : "[FAIL] "); + std::cout << std::setw(32) << std::left << description; + std::cout.copyfmt(fmt); + std::cout << " : " << result.tflops_tc_s << " TFLOPS/s" << std::endl; + if (verbose) { + std::cout << " t=" << result.runtime_ms << "ms, " + "smem=" << result.smem_size << "b" << std::endl; + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +struct KernelCoop {}; + +////////////////////////////////////////////////////////////////////////////////////////////////// + +template +void run_bwd_64(Mask fusion, Options const & options, cutlass::KernelHardwareInfo const& hw_info) { + auto run = [&](auto shape, auto kernel, const char* name, auto... kernel_options) { + BwdRunner runner; + auto result = runner.run(options, hw_info); + print_result(name, result, options.verbose); + }; + + using HeadDim = _64; + + run(Shape<_128, _128, HeadDim>{}, KernelCoop{}, "tma"); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +void run_bwd_128(Mask fusion, Options const & options, cutlass::KernelHardwareInfo const& hw_info) { + auto run = [&](auto shape, auto kernel, const char* name, auto... 
kernel_options) { + BwdRunner runner; + auto result = runner.run(options, hw_info); + print_result(name, result, options.verbose); + }; + + using HeadDim = _128; + + run(Shape<_128, _128, HeadDim>{}, KernelCoop{}, "tma"); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main_single(int argc, char const **args) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 12 || props.major != 10) { + std::cout + << "This example requires a GPU of NVIDIA's Blackwell Architecture " + << "(compute capability 100a) and CUDA 12.8 or greater.\n"; + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + +#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. 
+ hw_info.device_id = 0; + if (options.sm_count == 0) { + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + } + else { + hw_info.sm_count = options.sm_count; + } + + std::cout << "###### B " << options.b << " H " << options.h << " Q " << options.q << " K " << options.k << " D " << options.d << " "; + std::cout << "Backward" << " " << (options.causal ? "Causal" : "Full") << " "; + std::cout << "#SM " << hw_info.sm_count << std::endl; + + auto with_causal = [&](auto fn) { + if (options.causal) { + fn(CausalMask{}); + } + else { + fn(NoMask{}); + } + }; + + with_causal([&](auto fusion) { + if (options.d <= 64) { + run_bwd_64(fusion, options, hw_info); + } + else if (options.d <= 128) { + run_bwd_128(fusion, options, hw_info); + } + else { + std::cout << "No kernel instantiated for d=" << options.d << std::endl; + } + }); +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + std::vector full_arguments(args, args + argc); + + int result = 0; + + bool recursed = false; + for (size_t i = 1; i < full_arguments.size(); i++) { + if (full_arguments[i].find(',') != std::string::npos) { + auto arg = full_arguments[i]; + size_t eq_pos = arg.find('='); + std::string prefix = eq_pos == std::string::npos ? "" : arg.substr(0, eq_pos+1); + std::string rest = eq_pos == std::string::npos ? arg : arg.substr(eq_pos+1); + for (;;) { + size_t comma_pos = rest.find(','); + std::string current = rest.substr(0, comma_pos); + full_arguments[i] = prefix + current; + std::vector next_args; + for (auto& elem : full_arguments) { next_args.push_back(elem.data()); } + main(argc, next_args.data()); + if (comma_pos == std::string::npos) break; + rest = rest.substr(comma_pos+1); + } + recursed = true; + break; + } + } + + if (! 
recursed) { + main_single(argc, args); + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/77_blackwell_fmha/CMakeLists.txt b/examples/77_blackwell_fmha/CMakeLists.txt index bff609fa..f04ebe41 100644 --- a/examples/77_blackwell_fmha/CMakeLists.txt +++ b/examples/77_blackwell_fmha/CMakeLists.txt @@ -28,16 +28,14 @@ set_property( - SOURCE 77_blackwell_fmha.cu - PROPERTY COMPILE_FLAGS "--use_fast_math -ftemplate-backtrace-limit=0") - -set_property( - SOURCE 77_blackwell_fmha_gen.cu - PROPERTY COMPILE_FLAGS "--use_fast_math -ftemplate-backtrace-limit=0") - -set_property( - SOURCE 77_blackwell_mla.cu - PROPERTY COMPILE_FLAGS "--use_fast_math -ftemplate-backtrace-limit=0") + SOURCE + 77_blackwell_fmha.cu + 77_blackwell_fmha_gen.cu + 77_blackwell_mla.cu + 77_blackwell_fmha_bwd.cu + PROPERTY + COMPILE_FLAGS "--use_fast_math -ftemplate-backtrace-limit=0" +) set(TEST_BASIC --b=1 --h=4 --q=512 --k=512 --d=128 --verify --mask=no) set(TEST_CAUSAL --b=1 --h=4 --q=512 --k=512 --d=128 --verify --mask=causal) @@ -116,5 +114,34 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC target_compile_definitions(77_blackwell_mla_b2b_2sm_${PREC} PRIVATE ${PREC_MACRO} B2B) target_compile_options(77_blackwell_mla_b2b_2sm_${PREC} PRIVATE -Xptxas -v) + cutlass_example_add_executable( + 77_blackwell_fmha_bwd_${PREC} + 77_blackwell_fmha_bwd.cu + TEST_COMMAND_OPTIONS + TEST_BASIC + # TEST_GEN_VARLEN + # TEST_GEN_HDIM64 + # TEST_GEN_GQA + # TEST_GEN_REMAP + # TEST_GEN_CACHEONLY) + ) + target_include_directories(77_blackwell_fmha_bwd_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_compile_definitions(77_blackwell_fmha_bwd_${PREC} PRIVATE ${PREC_MACRO}) + target_compile_options(77_blackwell_fmha_bwd_${PREC} PRIVATE -Xptxas -v) + + cutlass_example_add_executable( + 77_blackwell_fmha_bwd_sat_${PREC} + 77_blackwell_fmha_bwd.cu + TEST_COMMAND_OPTIONS + TEST_BASIC + # 
TEST_GEN_VARLEN + TEST_GEN_HDIM64 + # TEST_GEN_GQA + # TEST_GEN_REMAP + # TEST_GEN_CACHEONLY) + ) + target_include_directories(77_blackwell_fmha_bwd_sat_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_compile_definitions(77_blackwell_fmha_bwd_sat_${PREC} PRIVATE ${PREC_MACRO} SKIP_ATOMIC) + target_compile_options(77_blackwell_fmha_bwd_sat_${PREC} PRIVATE -Xptxas -v) endforeach() endif() diff --git a/examples/77_blackwell_fmha/README.md b/examples/77_blackwell_fmha/README.md index c8250a7d..a1536dc8 100644 --- a/examples/77_blackwell_fmha/README.md +++ b/examples/77_blackwell_fmha/README.md @@ -22,6 +22,21 @@ The `apply_mask` function is called with the accumulator of the first GEMM and t It is well-suited for applying masks or activations. More complex fusions that require memory loads would require modifying the mainloop collective to orchestrate the load via TMA. +# FMHA for Blackwell: Backward + +This sample provides code for fused multi-head attention backward pass. +It supports HeadDims of 64 and 128, and fp8, fp16, and bf16 input data types. +The blocking in sequence length Q and K is 128, loads are done via TMA. +We support causal masking. +The structure of this code is very similar to the forward pass, and the techniques are analogous. + +There are three kernels to compute backwards, run in this order: +1. `FmhaKernelBwdSumOdO` to compute the sum of the outer product of O and dO. +2. `Sm100FmhaBwdKernelTmaWarpSpecialized` to compute the backward pass. +3. `FmhaKernelBwdConvert` to convert the dQ from fp32 to the final output precision. + +`Sm100FmhaBwdKernelTmaWarpSpecialized` is the main point of this sample, as it demonstrates how to use tensor cores to achieve a high performance fused kernel.
+ # MLA Inference for Blackwell This sample provides code for fused multi-head latent attention inference in diff --git a/examples/77_blackwell_fmha/device/fmha_device_bwd.hpp b/examples/77_blackwell_fmha/device/fmha_device_bwd.hpp new file mode 100644 index 00000000..80fcdf9f --- /dev/null +++ b/examples/77_blackwell_fmha/device/fmha_device_bwd.hpp @@ -0,0 +1,320 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +#pragma once + +// common +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cute/tensor.hpp" + +#include "../device/fmha.hpp" +#include "../kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp" +#include "../kernel/fmha_kernel_bwd_sum_OdO.hpp" +#include "../kernel/fmha_kernel_bwd_convert.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::fmha::device { + +//////////////////////////////////////////////////////////////////////////////// +////////////////////////////// CUTLASS 3.x API ///////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template +class Sm100FmhaBwd { +public: + /// Argument structure: User API + struct Arguments { + // Q K D HB + cute::tuple> problem_size; + + const Element* ptr_Q; + cute::tuple> stride_Q; + const Element* ptr_K; + cute::tuple> stride_K; + const Element* ptr_V; + cute::tuple> stride_V; + + const Element* ptr_O; + cute::tuple> stride_O; + const ElementAccumulator* ptr_LSE; + cute::tuple> stride_LSE; + + const Element* ptr_dO; + cute::tuple> stride_dO; + + Element* ptr_dQ; + cute::tuple> stride_dQ; + Element* ptr_dK; + cute::tuple> stride_dK; + Element* ptr_dV; + cute::tuple> stride_dV; + + ElementAccumulator softmax_scale; + + 
cutlass::KernelHardwareInfo hw_info; + }; + + using OperationSumOdO = cutlass::fmha::device::FMHA< + cutlass::fmha::kernel::FmhaKernelBwdSumOdO + >; + using OperationConvert = cutlass::fmha::device::FMHA< + cutlass::fmha::kernel::FmhaKernelBwdConvert + >; + + using Operation = cutlass::fmha::device::FMHA< + cutlass::fmha::kernel::Sm100FmhaBwdKernelTmaWarpSpecialized + >; + using Kernel = typename Operation::Kernel; + + struct Params { + OperationSumOdO op_sum_OdO; + Operation op; + OperationConvert op_convert; + ElementAccumulator* dQ_acc; + size_t dQ_acc_size; + }; + +private: + Params params_; + + static typename OperationSumOdO::Arguments to_sum_OdO_arguments( + Arguments const& args, + ElementAccumulator* sum_odo = nullptr, + ElementAccumulator* scaled_lse = nullptr) { + using namespace cute; + auto [Q, K, D, HB] = args.problem_size; + auto [H, B] = HB; + D = cutlass::round_up(D, 8); // Alignment + Q = cutlass::round_up(Q, 8); // Alignment + auto stride_sum_OdO = make_stride(_1{}, make_stride(Q, Q*H)); + auto stride_scaled_lse = make_stride(_1{}, make_stride(Q, Q*H)); + auto log2_e = log2f(expf(1.0f)); + return typename OperationSumOdO::Arguments { + args.problem_size, + args.ptr_O, args.stride_O, + args.ptr_dO, args.stride_dO, + sum_odo, stride_sum_OdO, + args.ptr_LSE, args.stride_LSE, + scaled_lse, stride_scaled_lse, + -1.0f, -log2_e + }; + } + + static typename OperationConvert::Arguments to_convert_arguments(Arguments const& args, ElementAccumulator* src = nullptr) { + using namespace cute; + auto [Q, K, D, HB] = args.problem_size; + auto [H, B] = HB; + D = cutlass::round_up(D, 8); // Alignment + Q = cutlass::round_up(Q, 8); // Alignment + auto stride_src_dQ = make_stride(D, _1{}, make_stride(D*Q, D*Q*H)); + return typename OperationConvert::Arguments { + args.problem_size, + src, stride_src_dQ, + nullptr, stride_src_dQ, + nullptr, stride_src_dQ, + args.ptr_dQ, args.stride_dQ, + nullptr, args.stride_dK, + nullptr, args.stride_dV, + args.softmax_scale + }; + 
} + + static typename Operation::Arguments to_bwd_arguments( + Arguments const& args, + ElementAccumulator* sum_OdO = nullptr, cute::tuple> const& stride_sum_OdO = {}, + ElementAccumulator* scaled_lse = nullptr, cute::tuple> const& stride_scaled_lse = {}, + ElementAccumulator* dQ_acc = nullptr, cute::tuple> const& stride_dQ = {}) { + return typename Operation::Arguments{ + args.problem_size, + { args.ptr_Q, args.stride_Q, + args.ptr_K, args.stride_K, + args.ptr_V, args.stride_V, + args.ptr_dO, args.stride_dO, + scaled_lse, stride_scaled_lse, + sum_OdO, stride_sum_OdO, + dQ_acc, stride_dQ, + args.softmax_scale }, + { args.ptr_dK, args.stride_dK, + args.ptr_dV, args.stride_dV }, + args.hw_info + }; + } + +public: + + /// Determines whether the GEMM can execute the given problem. + static Status + can_implement(Arguments const& args) { + Status status = Status::kSuccess; + + status = OperationSumOdO::can_implement(to_sum_OdO_arguments(args)); + if (status != Status::kSuccess) { + return status; + } + + status = OperationConvert::can_implement(to_convert_arguments(args)); + if (status != Status::kSuccess) { + return status; + } + + status = Operation::can_implement(to_bwd_arguments(args)); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + /// Gets the workspace size + static size_t + get_workspace_size(Arguments const& args) { + auto [Q, K, D, HB] = args.problem_size; + auto [H, B] = HB; + D = cutlass::round_up(D, 8); // Alignment + Q = cutlass::round_up(Q, 8); // Alignment + size_t workspace_bytes = 0; + // OdO vector + workspace_bytes += B*H*Q * sizeof(ElementAccumulator); + // scaled LSE vector + workspace_bytes += B*H*Q * sizeof(ElementAccumulator); + // FP32 versions of outputs that are churned (start off with Q only) + workspace_bytes += B*H*Q*D * sizeof(ElementAccumulator); + return workspace_bytes; + } + + /// Initializes state from arguments. 
+ Status + initialize_split(Arguments const& args, void* workspace_dQ, void* workspace_sum_OdO, void* workspace_scaled_lse, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("Universal::initialize_split() - workspace_dQ=" + << workspace_dQ << ", workspace_sum_OdO=" << workspace_sum_OdO << "stream: " << (stream ? "non-null" : "null")); + + auto [Q, K, D, HB] = args.problem_size; + auto [H, B] = HB; + D = cutlass::round_up(D, 8); // Alignment + Q = cutlass::round_up(Q, 8); // Alignment + ElementAccumulator* sum_OdO = reinterpret_cast(workspace_sum_OdO); + ElementAccumulator* scaled_lse = reinterpret_cast(workspace_scaled_lse); + ElementAccumulator* dQ_acc = reinterpret_cast(workspace_dQ); + params_.dQ_acc = dQ_acc; + params_.dQ_acc_size = B*H*Q*D * sizeof(ElementAccumulator); + auto args_sum_OdO = to_sum_OdO_arguments(args, sum_OdO, scaled_lse); + auto args_convert = to_convert_arguments(args, dQ_acc); + params_.op_sum_OdO.initialize(args_sum_OdO, nullptr, stream); + params_.op_convert.initialize(args_convert, nullptr, stream); + auto args_bwd = to_bwd_arguments( + args, sum_OdO, args_sum_OdO.stride_sum_OdO, + scaled_lse, args_sum_OdO.stride_scaled_lse, + dQ_acc, args_convert.stride_src_dQ + ); + params_.op.initialize(args_bwd, nullptr, stream); + + return Status::kSuccess; + } + + /// Initializes state from arguments. + Status + initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("Universal::initialize() - workspace " + << workspace << ", stream: " << (stream ? 
"non-null" : "null")); + + auto [Q, K, D, HB] = args.problem_size; + auto [H, B] = HB; + D = cutlass::round_up(D, 8); // Alignment + Q = cutlass::round_up(Q, 8); // Alignment + char* workspace_chr = reinterpret_cast(workspace); + ElementAccumulator* sum_OdO = reinterpret_cast(workspace_chr); + workspace_chr += B*H*Q * sizeof(ElementAccumulator); + ElementAccumulator* scaled_lse = reinterpret_cast(workspace_chr); + workspace_chr += B*H*Q * sizeof(ElementAccumulator); + ElementAccumulator* dQ_acc = reinterpret_cast(workspace_chr); + return initialize_split(args, dQ_acc, sum_OdO, scaled_lse, stream); + } + + /// Primary run() entry point API that is static allowing users to create and manage their own params. + /// Supplied params struct must be construct by calling Kernel::to_underling_arguments() + static Status + run(Params& params, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("FmhaDeviceBwd::run()"); + + Status result = Status::kSuccess; + result = params.op_sum_OdO.run(stream); + if (result != Status::kSuccess) { + return result; + } + + auto cuda_result = cudaMemsetAsync(params.dQ_acc, 0, params.dQ_acc_size, stream); + if (cuda_result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = params.op.run(stream); + if (result != Status::kSuccess) { + return result; + } + + result = params.op_convert.run(stream); + if (result != Status::kSuccess) { + return result; + } + + return Status::kSuccess; + } + + // + // Non-static launch overloads that first create and set the internal params struct of this kernel handle. + // + + /// Launches the kernel after first constructing Params internal state from supplied arguments. 
+ Status + run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + Status status = initialize(args, workspace, stream); + if (Status::kSuccess == status) { + status = run(params_, stream); + } + return status; + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + run(cudaStream_t stream = nullptr) { + return run(params_, stream); + } + +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::fmha::device + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/77_blackwell_fmha/kernel/fmha_kernel_bwd_convert.hpp b/examples/77_blackwell_fmha/kernel/fmha_kernel_bwd_convert.hpp new file mode 100644 index 00000000..c2618bcb --- /dev/null +++ b/examples/77_blackwell_fmha/kernel/fmha_kernel_bwd_convert.hpp @@ -0,0 +1,146 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cute/layout.hpp" + +namespace cutlass::fmha::kernel { + +using namespace cute; + +template +struct FmhaKernelBwdConvert { + + struct Arguments { + tuple> problem_size; + + const ElementAcc* ptr_src_dQ; + tuple> stride_src_dQ; + const ElementAcc* ptr_src_dK; + tuple> stride_src_dK; + const ElementAcc* ptr_src_dV; + tuple> stride_src_dV; + + Element* ptr_dest_dQ; + tuple> stride_dest_dQ; + Element* ptr_dest_dK; + tuple> stride_dest_dK; + Element* ptr_dest_dV; + tuple> stride_dest_dV; + + ElementAcc scale = 1.0; + }; + + using Params = Arguments; + + using ClusterShape = Shape<_1, _1, _1>; + static constexpr int SharedStorageSize = 0; + + static const int MinBlocksPerMultiprocessor = 1; + static const int MaxThreadsPerBlock = 128; + using ArchTag = cutlass::arch::Sm90; + + static const int kBlockSeq = 8; + + static size_t get_workspace_size(Arguments const& args) { return 0; } + static cutlass::Status initialize_workspace(Arguments const&, void*, cudaStream_t) { + return cutlass::Status::kSuccess; + } + + static const 
int kNumThreadsD = 16; + static const int kNumThreadsSeq = MaxThreadsPerBlock / kNumThreadsD; + static const int kElementsPerLoad = 4; + + static const int kIterationsSeq = kBlockSeq / kNumThreadsSeq; + + static bool can_implement(Arguments const& args) { + return get<2>(args.problem_size) % kElementsPerLoad == 0; + } + + static dim3 get_grid_shape(Params const& params) { + dim3 grid(size<3,0>(params.problem_size), size<3,1>(params.problem_size), ceil_div(std::max(size<0>(params.problem_size), size<1>(params.problem_size)), kBlockSeq)); + return grid; + } + + static dim3 get_block_shape() { + dim3 block(kNumThreadsD, kNumThreadsSeq, 1); + return block; + } + + static Params to_underlying_arguments(Arguments const& args, void* workspace) { + return args; + } + + template + CUTLASS_DEVICE void copy(Params const& params, const ElementAcc* ptr_src, StrideSrc const& stride_src, Element* ptr_dest, StrideDest const& stride_dest, int count) { + auto ptr_src_bh = ptr_src + get<2,0>(stride_src) * blockIdx.x + get<2,1>(stride_src) * blockIdx.y; + auto ptr_dest_bh = ptr_dest + get<2,0>(stride_dest) * blockIdx.x + get<2,1>(stride_dest) * blockIdx.y; + + for (int idx_s_t = threadIdx.y; idx_s_t < kBlockSeq; idx_s_t += kNumThreadsSeq) { + int idx_s = idx_s_t + kBlockSeq * blockIdx.z; + if (idx_s >= count) continue; + auto ptr_src_bhs = ptr_src_bh + idx_s * get<0>(stride_src); + auto ptr_dest_bhs = ptr_dest_bh + idx_s * get<0>(stride_dest); + + for (int idx_d = threadIdx.x * kElementsPerLoad; idx_d < get<2>(params.problem_size); idx_d += kElementsPerLoad * kNumThreadsD) { + ElementAcc value_src[kElementsPerLoad]; + Element value_dest[kElementsPerLoad]; + + using VecSrc = uint_bit_t * kElementsPerLoad>; + using VecDest = uint_bit_t * kElementsPerLoad>; + *reinterpret_cast(value_src) = *reinterpret_cast(&ptr_src_bhs[idx_d]); + + for (int v = 0; v < kElementsPerLoad; v++) { + value_dest[v] = static_cast(params.scale * value_src[v]); + } + + *reinterpret_cast(&ptr_dest_bhs[idx_d]) = 
*reinterpret_cast(value_dest); + } + } + } + + CUTLASS_DEVICE void operator()(const Params ¶ms, char* smem) { + if (params.ptr_src_dQ != nullptr) { + copy(params, params.ptr_src_dQ, params.stride_src_dQ, params.ptr_dest_dQ, params.stride_dest_dQ, get<0>(params.problem_size)); + } + if (params.ptr_src_dK != nullptr) { + copy(params, params.ptr_src_dK, params.stride_src_dK, params.ptr_dest_dK, params.stride_dest_dK, get<1>(params.problem_size)); + } + if (params.ptr_src_dV != nullptr) { + copy(params, params.ptr_src_dV, params.stride_src_dV, params.ptr_dest_dV, params.stride_dest_dV, get<1>(params.problem_size)); + } + } +}; + +} // namespace cutlass::fmha::kernel diff --git a/examples/77_blackwell_fmha/kernel/fmha_kernel_bwd_sum_OdO.hpp b/examples/77_blackwell_fmha/kernel/fmha_kernel_bwd_sum_OdO.hpp new file mode 100644 index 00000000..44080e2d --- /dev/null +++ b/examples/77_blackwell_fmha/kernel/fmha_kernel_bwd_sum_OdO.hpp @@ -0,0 +1,151 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cute/layout.hpp" + +namespace cutlass::fmha::kernel { + +using namespace cute; + +template +struct FmhaKernelBwdSumOdO { + + struct Arguments { + cute::tuple> problem_size; + + const Element* ptr_O; + cute::tuple> stride_O; + const Element* ptr_dO; + cute::tuple> stride_dO; + + ElementAcc* ptr_sum_OdO; + cute::tuple> stride_sum_OdO; + + const ElementAcc* ptr_lse = nullptr; + cute::tuple> stride_lse; + + ElementAcc* ptr_scaled_lse = nullptr; + cute::tuple> stride_scaled_lse; + + ElementAcc sum_odo_scale = 1.0; + ElementAcc lse_scale = 1.0; + }; + + using Params = Arguments; + + using ClusterShape = Shape<_1, _1, _1>; + static constexpr int SharedStorageSize = 0; + + static const int MinBlocksPerMultiprocessor = 1; + static const int MaxThreadsPerBlock = 128; + using ArchTag = cutlass::arch::Sm100; + + static size_t get_workspace_size(Arguments const& args) { return 0; } + static cutlass::Status initialize_workspace(Arguments const&, void*, cudaStream_t) { + return cutlass::Status::kSuccess; + } + + static const int 
kBlockQ = 16; + + static const int kNumThreadsD = 8; + static const int kNumThreadsQ = MaxThreadsPerBlock / kNumThreadsD; + static const int kElementsPerLoad = 2; + + static const int kIterationsQ = kBlockQ / kNumThreadsQ; + + static bool can_implement(Arguments const& args) { + return get<2>(args.problem_size) % kElementsPerLoad == 0; + } + + static dim3 get_grid_shape(Params const& params) { + dim3 grid(ceil_div(size<0>(params.problem_size), kBlockQ), size<3,0>(params.problem_size), size<3,1>(params.problem_size)); + return grid; + } + + static dim3 get_block_shape() { + dim3 block(kNumThreadsD, kNumThreadsQ, 1); + return block; + } + + static Params to_underlying_arguments(Arguments const& args, void* workspace) { + return args; + } + + CUTLASS_DEVICE void operator()(const Params ¶ms, char* smem) { + auto ptr_O_bh = params.ptr_O + blockIdx.y * get<2,0>(params.stride_O) + blockIdx.z * get<2,1>(params.stride_O); + auto ptr_dO_bh = params.ptr_dO + blockIdx.y * get<2,0>(params.stride_dO) + blockIdx.z * get<2,1>(params.stride_dO); + auto ptr_sum_OdO_bh = params.ptr_sum_OdO + blockIdx.y * get<1,0>(params.stride_sum_OdO) + blockIdx.z * get<1,1>(params.stride_sum_OdO); + auto ptr_lse_bh = params.ptr_lse + blockIdx.y * get<1,0>(params.stride_lse) + blockIdx.z * get<1,1>(params.stride_lse); + auto ptr_scaled_lse_bh = params.ptr_scaled_lse + blockIdx.y * get<1,0>(params.stride_scaled_lse) + blockIdx.z * get<1,1>(params.stride_scaled_lse); + + CUTLASS_PRAGMA_UNROLL + for (int idx_q_t = threadIdx.y; idx_q_t < kBlockQ; idx_q_t += kNumThreadsQ) { + int idx_q = idx_q_t + kBlockQ * blockIdx.x; + if (idx_q >= get<0>(params.problem_size)) continue; + ElementAcc acc = 0; + auto ptr_O_bhq = ptr_O_bh + idx_q * get<0>(params.stride_O); + auto ptr_dO_bhq = ptr_dO_bh + idx_q * get<0>(params.stride_dO); + auto ptr_sum_OdO_bhq = ptr_sum_OdO_bh + idx_q * get<0>(params.stride_sum_OdO); + auto ptr_lse_bhq = ptr_lse_bh + idx_q * get<0>(params.stride_lse); + auto ptr_scaled_lse_bhq = 
ptr_scaled_lse_bh + idx_q * get<0>(params.stride_scaled_lse); + + for (int idx_d = threadIdx.x * kElementsPerLoad; idx_d < get<2>(params.problem_size); idx_d += kElementsPerLoad * kNumThreadsD) { + Element value_O[kElementsPerLoad]; + Element value_dO[kElementsPerLoad]; + + using Vec = uint_bit_t * kElementsPerLoad>; + *reinterpret_cast(value_O) = *reinterpret_cast(&ptr_O_bhq[idx_d]); + *reinterpret_cast(value_dO) = *reinterpret_cast(&ptr_dO_bhq[idx_d]); + + for (int v = 0; v < kElementsPerLoad; v++) { + acc += value_O[v] * value_dO[v]; + } + } + + for (int i = 1; i < kNumThreadsD; i *= 2) { + acc += __shfl_xor_sync((uint32_t)-1, acc, i, kNumThreadsD); + } + + if (threadIdx.x == 0) { + *ptr_sum_OdO_bhq = params.sum_odo_scale * acc; + if (params.ptr_scaled_lse) { + *ptr_scaled_lse_bhq = params.lse_scale * *ptr_lse_bhq; + } + } + } + } +}; + +} // namespace cutlass::fmha::kernel diff --git a/examples/77_blackwell_fmha/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp new file mode 100644 index 00000000..e1bd43d5 --- /dev/null +++ b/examples/77_blackwell_fmha/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp @@ -0,0 +1,1699 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/arch/simd_sm100.hpp" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/memory_sm80.h" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "collective/fmha_common.hpp" + +namespace cutlass::fmha::kernel { + +using namespace cutlass::fmha::collective; + +using namespace cute; + +template< + class Element, + class ElementAcc, + class TileShape, + class Mask +> +struct Sm100FmhaBwdKernelTmaWarpSpecialized { + + using TileShapeQ = decltype(get<0>(TileShape{})); + static_assert(std::is_same_v, "tile shape K must be 128"); + using TileShapeK = decltype(get<1>(TileShape{})); + static_assert(std::is_same_v, "tile shape K must be 128"); + using TileShapeDQK = decltype(get<2>(TileShape{})); + using TileShapeDVO = decltype(get<2>(TileShape{})); + + using TmemAllocator = 
cute::TMEM::Allocator1Sm; + struct TmemAllocation { + static constexpr uint32_t kDK = 0; // TileShapeK x TileShapeDQK x acc + static constexpr uint32_t kDV = kDK + TileShapeDQK{}; // TileShapeK x TileShapeDVO x acc + static constexpr uint32_t kDQ = kDV + TileShapeDVO{}; // TileShapeQ x TileShapeDQK x acc + static constexpr uint32_t kDP = kDQ; // TileShapeK x TileShapeQ x inp + static constexpr uint32_t kS = kDQ + max(TileShapeQ{}, TileShapeDQK{}); + static constexpr uint32_t kP = kS; + static constexpr uint32_t kTotal = kS + TileShapeQ{}; + }; + + static_assert( + static_cast(TmemAllocation::kTotal) <= TmemAllocator::Sm100TmemCapacityColumns, + "using too much tmem" + ); + + enum class WarpRole { + Empty = 0x0, Load = 0x1, Mma = 0x2, Compute = 0x3, Reduce = 0x4 + }; + + static constexpr unsigned long long kWarpAssignment = 0x12'3333'3333'4444ull; + static constexpr int kNumComputeWarps = 8; + static constexpr int kNumReduceWarps = 4; + CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) { + return static_cast((kWarpAssignment >> (4 * warp_idx)) & 0xF); + } + + struct RegisterAllocation { + static constexpr int kWarpgroup0 = 160-8; + static constexpr int kWarpgroup1 = 128; + static constexpr int kWarpgroup2 = 96; + static constexpr int kReduce = kWarpgroup0; + static constexpr int kCompute = kWarpgroup1; + static constexpr int kMma = kWarpgroup2; + static constexpr int kEmpty = kWarpgroup2; + static constexpr int kLoad = kWarpgroup2; + + static_assert(kWarpgroup0 + 2 * kWarpgroup1 + kWarpgroup2 <= 512); + }; + + using ArchTag = cutlass::arch::Sm100; + + using ClusterShape = Shape<_1, _1, _1>; + using Schedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100; + + static constexpr int MinBlocksPerMultiprocessor = 1; + static constexpr int kNumWarps = kNumComputeWarps + kNumReduceWarps + 4; + static constexpr int MaxThreadsPerBlock = NumThreadsPerWarp * kNumWarps; + + static constexpr int Alignment = 128 / sizeof_bits_v; + static constexpr int kStages = 2; + + 
using TensorStrideContiguousK = Stride>; + using TensorStrideContiguousMN = Stride<_1, int, Stride>; + + // compute S + using CollectiveMmaKQ = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + Element, TensorStrideContiguousK, Alignment, + Element, TensorStrideContiguousK, Alignment, + ElementAcc, + Shape, + ClusterShape, cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using TileShapeKQ = typename CollectiveMmaKQ::TileShape; + using TiledMmaKQ = typename CollectiveMmaKQ::TiledMma; + + // compute dP + using CollectiveMmaVDO = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + Element, TensorStrideContiguousK, Alignment, + Element, TensorStrideContiguousK, Alignment, + ElementAcc, + Shape, + ClusterShape, cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using TileShapeVDO = typename CollectiveMmaVDO::TileShape; + using TiledMmaVDO = typename CollectiveMmaVDO::TiledMma; + + // compute dV + using CollectiveMmaPDO = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + // needs to match ordering of S calculation + Element, TensorStrideContiguousK, Alignment, + Element, TensorStrideContiguousMN, Alignment, + ElementAcc, + Shape, + ClusterShape, cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using TileShapePDO = typename CollectiveMmaPDO::TileShape; + using TiledMmaPDO = decltype(to_tiled_mma_sm100_ts(typename CollectiveMmaPDO::TiledMma{})); + + // compute dK + using CollectiveMmaDSQ = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + // somewhat arbitrary since we dump to smem, need to agree with the next one + Element, TensorStrideContiguousK , Alignment, + Element, TensorStrideContiguousMN, Alignment, + ElementAcc, + Shape, + ClusterShape, 
cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using TileShapeDSQ = typename CollectiveMmaDSQ::TileShape; + using TiledMmaDSQ = typename CollectiveMmaDSQ::TiledMma; + + // compute dQ + using CollectiveMmaDSK = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + // somewhat arbitrary since we dump to smem, need to agree with the previous one + Element, TensorStrideContiguousMN, Alignment, + Element, TensorStrideContiguousMN, Alignment, + ElementAcc, + Shape, + ClusterShape, cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using TileShapeDSK = typename CollectiveMmaDSK::TileShape; + using TiledMmaDSK = typename CollectiveMmaDSK::TiledMma; + + // pipelines are named Pipeline + static constexpr int kStagesComputeSmem = 1; + using PipelineLoadMmaQ = PipelineTmaUmmaAsync<2, ClusterShape>; + using PipelineLoadMmaDO = PipelineTmaUmmaAsync<1, ClusterShape>; + using PipelineLoadComputeLSE = PipelineAsync<1>; + using PipelineLoadComputeSumOdO = PipelineAsync<1>; + using PipelineMmaComputeS = PipelineUmmaAsync<1>; + using PipelineMmaComputeDP = PipelineUmmaAsync<1>; + using PipelineMmaReduceDQ = PipelineUmmaAsync<1>; + using PipelineComputeMmaP = PipelineUmmaConsumerAsync<1>; + using PipelineComputeMmaDS = PipelineUmmaConsumerAsync; + using PipelineMmaComputeDKDV = PipelineUmmaAsync<2>; + static constexpr int kStagesReduceTmaStore = 2; + using PipelineReduceTmaStore = PipelineTmaStore; + + struct PipelineStorage { + alignas(16) typename PipelineLoadMmaQ::SharedStorage load_mma_q; + alignas(16) typename PipelineLoadMmaDO::SharedStorage load_mma_do; + alignas(16) typename PipelineLoadComputeLSE::SharedStorage load_compute_lse; + alignas(16) typename PipelineLoadComputeSumOdO::SharedStorage load_compute_sum_odo; + alignas(16) typename PipelineMmaComputeS::SharedStorage mma_compute_s; + alignas(16) typename PipelineMmaComputeDP::SharedStorage mma_compute_dp; + alignas(16) typename 
PipelineMmaReduceDQ::SharedStorage mma_reduce_dq; + alignas(16) typename PipelineComputeMmaP::SharedStorage compute_mma_p; + alignas(16) typename PipelineComputeMmaDS::SharedStorage compute_mma_ds; + alignas(16) typename PipelineMmaComputeDKDV::SharedStorage mma_compute_dkdv; + }; + + template + static CUTE_DEVICE constexpr auto restage(Layout const& layout, Stages stages = {}) { + return composition(layout, make_tuple(_, _, _, make_layout(stages))); + } + + using SmemLayoutK = decltype(restage(typename CollectiveMmaKQ::SmemLayoutA{})); + using SmemLayoutV = decltype(restage(typename CollectiveMmaVDO::SmemLayoutA{})); + using SmemLayoutQ = decltype(restage(typename CollectiveMmaKQ::SmemLayoutB{}, _2{})); + using SmemLayoutDO = decltype(restage(typename CollectiveMmaVDO::SmemLayoutB{}, _1{})); + using SmemLayoutDS = decltype(restage(typename CollectiveMmaDSK::SmemLayoutA{}, Int{})); + using SmemLayoutLSE = Layout>; + using SmemLayoutSumOdO = Layout>; + + using SmemLayoutQT = decltype(restage(typename CollectiveMmaDSQ::SmemLayoutB{}, _2{})); + using SmemLayoutKT = decltype(restage(typename CollectiveMmaDSK::SmemLayoutB{})); + using SmemLayoutDST = decltype(restage(typename CollectiveMmaDSQ::SmemLayoutA{}, Int{})); + using SmemLayoutDOT = decltype(restage(typename CollectiveMmaPDO::SmemLayoutB{}, _1{})); + + using TileShapeDQ = _32; + using SmemAtomDQ = decltype(cutlass::gemm::collective::detail::sm100_smem_selector< + cute::UMMA::Major::K, ElementAcc, TileShapeQ, TileShapeDQ + >()); + using SmemShapeDQ = Shape>; + using SmemLayoutDQ = decltype(tile_to_shape(SmemAtomDQ{}, SmemShapeDQ{}, Step<_2, _1, _3>{})); + + struct TensorStorage { + union { + alignas(2048) cute::array> smem_k; + alignas(2048) cute::array> smem_k_t; + }; + alignas(2048) cute::array> smem_v; + union { + alignas(2048) cute::array> smem_q; + alignas(2048) cute::array> smem_q_t; + }; + union { + alignas(2048) cute::array> smem_do; + alignas(2048) cute::array> smem_do_t; + }; + union { + alignas(2048) 
cute::array> smem_ds; + alignas(2048) cute::array> smem_ds_t; + }; + alignas(1024) cute::array> smem_dq; + alignas(16) cute::array> smem_lse; + alignas(16) cute::array> smem_sum_odo; + }; + + static constexpr int kTransactionsBytesLoadQ = cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutQ{})) * cute::sizeof_bits_v); + static constexpr int kTransactionsBytesLoadDO = cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutDO{})) * cute::sizeof_bits_v); + + static constexpr int kTransactionsBytesLoadK = cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutK{})) * cute::sizeof_bits_v); + static constexpr int kTransactionsBytesLoadV = cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutV{})) * cute::sizeof_bits_v); + + struct SharedStorage { + TensorStorage tensors; + PipelineStorage pipelines; + uint32_t tmem_base_ptr; + }; + + // this is tight enough that it won't work with sizeof due to padding for alignment + static constexpr int SharedStorageSize = offsetof(SharedStorage, tmem_base_ptr) + sizeof(uint32_t); + static_assert(SharedStorageSize <= cutlass::arch::sm100_smem_capacity_bytes, "using too much smem"); + + using ProblemShape = Shape>; // Q K D (H B), eventuall D = (D_QK, D_VO) + using TensorStride = TensorStrideContiguousK; // S D (H B) + using RowTensorStride = Stride<_1, Stride>; // S (H B) + + struct MainloopArguments { + const Element* ptr_q; + TensorStride stride_q; + const Element* ptr_k; + TensorStride stride_k; + const Element* ptr_v; + TensorStride stride_v; + const Element* ptr_do; + TensorStride stride_do; + + const ElementAcc* ptr_lse; + RowTensorStride stride_lse; + + const ElementAcc* ptr_sum_odo; + RowTensorStride stride_sum_odo; + + ElementAcc* ptr_dq_acc; + TensorStride stride_dq_acc; + + ElementAcc softmax_scale = 1.0f / sqrtf(TileShapeDQK{}); + }; + + using TMA_K = typename CollectiveMmaKQ::Params::TMA_A; + using TMA_V = typename CollectiveMmaVDO::Params::TMA_A; + using TMA_Q = typename CollectiveMmaKQ::Params::TMA_B; + using TMA_DO = typename 
CollectiveMmaVDO::Params::TMA_B; + + using TMA_DQ = decltype(make_tma_copy(SM90_TMA_REDUCE_ADD{}, + make_tensor((const ElementAcc*)nullptr, make_shape(1, 1, make_shape(1, 1)), TensorStride{}), + SmemLayoutDQ{}(_, _, _0{}) + )); + + struct MainloopParams { + TMA_K tma_load_k; + TMA_V tma_load_v; + TMA_Q tma_load_q; + TMA_DO tma_load_do; + TMA_DQ tma_red_dq; + }; + + struct EpilogueArguments { + Element* ptr_dk; + TensorStride stride_dk; + Element* ptr_dv; + TensorStride stride_dv; + }; + + struct Arguments { + ProblemShape problem_shape; + MainloopArguments mainloop; + EpilogueArguments epilogue; + KernelHardwareInfo hw_info; + }; + + struct Params { + ProblemShape problem_shape; + MainloopArguments mainloop; + MainloopParams mainloop_params; + EpilogueArguments epilogue; + KernelHardwareInfo hw_info; + }; + + + static bool can_implement(Arguments const& args) { + auto [Q, K, D, HB] = args.problem_shape; + auto [H, B] = HB; + if (Q <= 0 || K <= 0 || D <= 0 || H <= 0 || B <= 0) { + return false; + } + if (D % Alignment != 0) { + return false; + } + return true; + } + + + static Status initialize_workspace(Arguments const&, void*, cudaStream_t) { + return Status::kSuccess; + } + + + static Params to_underlying_arguments(Arguments const& args, void*) { + auto [Q, K, D, HB] = args.problem_shape; + + auto params_kq = CollectiveMmaKQ::to_underlying_arguments( + make_shape(K, Q, D, HB), + typename CollectiveMmaKQ::Arguments { + args.mainloop.ptr_k, args.mainloop.stride_k, + args.mainloop.ptr_q, args.mainloop.stride_q, + }, /*workspace=*/nullptr); + + auto params_vdo = CollectiveMmaVDO::to_underlying_arguments( + make_shape(K, Q, D, HB), + typename CollectiveMmaVDO::Arguments { + args.mainloop.ptr_v, args.mainloop.stride_v, + args.mainloop.ptr_do, args.mainloop.stride_do, + }, /*workspace=*/nullptr); + + TMA_DQ tma_red_dq = make_tma_copy( + SM90_TMA_REDUCE_ADD{}, + make_tensor(args.mainloop.ptr_dq_acc, make_shape(Q, D, HB), args.mainloop.stride_dq_acc), + SmemLayoutDQ{}(_, 
_, _0{}) + ); + + return Params{ + args.problem_shape, + args.mainloop, + MainloopParams{ + params_kq.tma_load_a, + params_vdo.tma_load_a, + params_kq.tma_load_b, + params_vdo.tma_load_b, + tma_red_dq + }, + args.epilogue, + args.hw_info + }; + } + + + template + static CUTLASS_DEVICE auto quantize(T const& input) { + constexpr int AlignmentS = 4; + auto output = make_tensor(shape(input)); + auto input_vec = recast>(input); + auto output_vec = recast>(output); + + cutlass::NumericArrayConverter epilogue_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(input_vec); i++) { + output_vec(i) = epilogue_op(input_vec(i)); + } + + return output; + } + + + template + CUTLASS_DEVICE void load( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + int iter_index, + int iter_count, + MainloopArguments const& mainloop_args, + MainloopParams const& mainloop_params, + TensorStorage& shared_tensors, + PipelineLoadMmaQ& pipeline_load_mma_q, + typename PipelineLoadMmaQ::PipelineState& pipeline_load_mma_q_producer_state, + PipelineLoadMmaDO& pipeline_load_mma_do, + typename PipelineLoadMmaDO::PipelineState& pipeline_load_mma_do_producer_state, + PipelineLoadComputeLSE& pipeline_load_compute_lse, + typename PipelineLoadComputeLSE::PipelineState& pipeline_load_compute_lse_producer_state, + PipelineLoadComputeSumOdO& pipeline_load_compute_sum_odo, + typename PipelineLoadComputeSumOdO::PipelineState& pipeline_load_compute_sum_odo_producer_state) { + + auto [Q, K, D, HB] = problem_shape; + + using X = Underscore; + + uint16_t mcast_mask = 0; + + auto mK = mainloop_params.tma_load_k.get_tma_tensor(make_shape(K, D, HB)); + auto mQ = mainloop_params.tma_load_q.get_tma_tensor(make_shape(Q, D, HB)); + auto mV = mainloop_params.tma_load_v.get_tma_tensor(make_shape(K, D, HB)); + auto mDO = mainloop_params.tma_load_do.get_tma_tensor(make_shape(Q, D, HB)); + + auto gK = local_tile(mK, TileShapeKQ{}, make_coord(_,_,_), Step<_1, X, _1>{}); + auto gQ = local_tile(mQ, 
TileShapeKQ{}, make_coord(_,_,_), Step{}); + auto gV = local_tile(mV, TileShapeVDO{}, make_coord(_,_,_), Step<_1, X, _1>{}); + auto gDO = local_tile(mDO, TileShapeVDO{}, make_coord(_,_,_), Step{}); + + ThrMMA cta_mma_kq = TiledMmaKQ{}.get_slice(_0{}); + ThrMMA cta_mma_vdo = TiledMmaVDO{}.get_slice(_0{}); + + auto tSTgK = cta_mma_kq.partition_A(gK); + auto tSTgQ = cta_mma_kq.partition_B(gQ); + auto tDPTgV = cta_mma_vdo.partition_A(gV); + auto tDPTgDO = cta_mma_vdo.partition_B(gDO); + + auto sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{}); + auto sK = make_tensor(make_smem_ptr(shared_tensors.smem_k.begin()), SmemLayoutK{}); + auto sV = make_tensor(make_smem_ptr(shared_tensors.smem_v.begin()), SmemLayoutV{}); + auto sDO = make_tensor(make_smem_ptr(shared_tensors.smem_do.begin()), SmemLayoutDO{}); + + auto [tKgK_mkl, tKsK] = tma_partition( + mainloop_params.tma_load_k, _0{}, make_layout(_1{}), + group_modes<0,3>(sK), group_modes<0,3>(tSTgK)); + auto [tQgQ_mkl, tQsQ] = tma_partition( + mainloop_params.tma_load_q, _0{}, make_layout(_1{}), + group_modes<0,3>(sQ), group_modes<0,3>(tSTgQ)); + auto [tVgV_mkl, tVsV] = tma_partition( + mainloop_params.tma_load_v, _0{}, make_layout(_1{}), + group_modes<0,3>(sV), group_modes<0,3>(tDPTgV)); + auto [tDOgDO_mkl, tDOsDO] = tma_partition( + mainloop_params.tma_load_do, _0{}, make_layout(_1{}), + group_modes<0,3>(sDO), group_modes<0,3>(tDPTgDO)); + + // set up lse and sum_odo + + auto [blk_coord_q, blk_coord_k, blk_coord_batch] = blk_coord; + + pipeline_load_mma_q.producer_acquire(pipeline_load_mma_q_producer_state); + auto tma_barrier = pipeline_load_mma_q.producer_get_barrier(pipeline_load_mma_q_producer_state); + + pipeline_load_mma_q.producer_expect_transaction(pipeline_load_mma_q_producer_state, kTransactionsBytesLoadK); + + // load K + if (cute::elect_one_sync()) { + cute::copy( + mainloop_params.tma_load_k.with(*tma_barrier, mcast_mask), + tKgK_mkl(_, blk_coord_k, _0{}, blk_coord_batch), + tKsK(_, 
_0{}) + ); + } + + // load Q + if (cute::elect_one_sync()) { + cute::copy( + mainloop_params.tma_load_q.with(*tma_barrier, mcast_mask), + tQgQ_mkl(_, iter_index, _0{}, blk_coord_batch), + tQsQ(_, pipeline_load_mma_q_producer_state.index()) + ); + } + + ++pipeline_load_mma_q_producer_state; + + pipeline_load_compute_lse.producer_acquire(pipeline_load_compute_lse_producer_state); + + // load LSE + // 32 threads loading 128 values of 32b each + // so 4*32b=128b + + int thread_idx = threadIdx.x % NumThreadsPerWarp; + int smem_idx = TileShapeQ{} * pipeline_load_compute_lse_producer_state.index() + thread_idx * 4; + int gmem_idx = TileShapeQ{} * iter_index + thread_idx * 4; + auto mLSE = make_tensor(mainloop_args.ptr_lse, make_shape(Q, HB), mainloop_args.stride_lse); + cutlass::arch::cp_async_zfill<16>( + shared_tensors.smem_lse.begin() + smem_idx, + &mLSE(gmem_idx, blk_coord_batch), + gmem_idx < Q + ); + + pipeline_load_compute_lse.producer_commit(pipeline_load_compute_lse_producer_state, cutlass::arch::cpasync_barrier_arrive); + ++pipeline_load_compute_lse_producer_state; + + + pipeline_load_mma_do.producer_acquire(pipeline_load_mma_do_producer_state); + tma_barrier = pipeline_load_mma_do.producer_get_barrier(pipeline_load_mma_do_producer_state); + + pipeline_load_mma_do.producer_expect_transaction(pipeline_load_mma_do_producer_state, kTransactionsBytesLoadV); + + // load V + if (cute::elect_one_sync()) { + cute::copy( + mainloop_params.tma_load_v.with(*tma_barrier, mcast_mask), + tVgV_mkl(_, blk_coord_k, _0{}, blk_coord_batch), + tVsV(_, _0{}) + ); + } + + // load dO + if (cute::elect_one_sync()) { + cute::copy( + mainloop_params.tma_load_do.with(*tma_barrier, mcast_mask), + tDOgDO_mkl(_, iter_index, _0{}, blk_coord_batch), + tDOsDO(_, pipeline_load_mma_do_producer_state.index()) + ); + } + + ++pipeline_load_mma_do_producer_state; + + pipeline_load_compute_sum_odo.producer_acquire(pipeline_load_compute_sum_odo_producer_state); + + // load sum_OdO + smem_idx = 
TileShapeQ{} * pipeline_load_compute_sum_odo_producer_state.index() + thread_idx * 4; + gmem_idx = TileShapeQ{} * iter_index + thread_idx * 4; + auto mSumOdO = make_tensor(mainloop_args.ptr_sum_odo, make_shape(Q, HB), mainloop_args.stride_sum_odo); + cutlass::arch::cp_async<16>( + shared_tensors.smem_sum_odo.begin() + smem_idx, + &mSumOdO(gmem_idx, blk_coord_batch), + gmem_idx < Q + ); + + pipeline_load_compute_sum_odo.producer_commit(pipeline_load_compute_sum_odo_producer_state, cutlass::arch::cpasync_barrier_arrive); + ++pipeline_load_compute_sum_odo_producer_state; + + iter_count -= 1; + iter_index += 1; + + while (iter_count > 0) { + pipeline_load_mma_q.producer_acquire(pipeline_load_mma_q_producer_state); + tma_barrier = pipeline_load_mma_q.producer_get_barrier(pipeline_load_mma_q_producer_state); + + // load Q + if (cute::elect_one_sync()) { + cute::copy( + mainloop_params.tma_load_q.with(*tma_barrier, mcast_mask), + tQgQ_mkl(_, iter_index, _0{}, blk_coord_batch), + tQsQ(_, pipeline_load_mma_q_producer_state.index()) + ); + } + + ++pipeline_load_mma_q_producer_state; + + pipeline_load_compute_lse.producer_acquire(pipeline_load_compute_lse_producer_state); + + // load LSE + smem_idx = TileShapeQ{} * pipeline_load_compute_lse_producer_state.index() + thread_idx * 4; + gmem_idx = TileShapeQ{} * iter_index + thread_idx * 4; + cutlass::arch::cp_async<16>( + shared_tensors.smem_lse.begin() + smem_idx, + &mLSE(gmem_idx, blk_coord_batch), + gmem_idx < Q + ); + + pipeline_load_compute_lse.producer_commit(pipeline_load_compute_lse_producer_state, cutlass::arch::cpasync_barrier_arrive); + ++pipeline_load_compute_lse_producer_state; + + pipeline_load_mma_do.producer_acquire(pipeline_load_mma_do_producer_state); + tma_barrier = pipeline_load_mma_do.producer_get_barrier(pipeline_load_mma_do_producer_state); + + // load dO + if (cute::elect_one_sync()) { + cute::copy( + mainloop_params.tma_load_do.with(*tma_barrier, mcast_mask), + tDOgDO_mkl(_, iter_index, _0{}, 
blk_coord_batch), + tDOsDO(_, pipeline_load_mma_do_producer_state.index()) + ); + } + + ++pipeline_load_mma_do_producer_state; + + pipeline_load_compute_sum_odo.producer_acquire(pipeline_load_compute_sum_odo_producer_state); + + // load sum_OdO + smem_idx = TileShapeQ{} * pipeline_load_compute_sum_odo_producer_state.index() + thread_idx * 4; + gmem_idx = TileShapeQ{} * iter_index + thread_idx * 4; + cutlass::arch::cp_async_zfill<16>( + shared_tensors.smem_sum_odo.begin() + smem_idx, + &mSumOdO(gmem_idx, blk_coord_batch), + gmem_idx < Q + ); + + pipeline_load_compute_sum_odo.producer_commit(pipeline_load_compute_sum_odo_producer_state, cutlass::arch::cpasync_barrier_arrive); + ++pipeline_load_compute_sum_odo_producer_state; + + iter_count -= 1; + iter_index += 1; + } + } + + + template + CUTLASS_DEVICE void mma( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + int iter_index, + int iter_count, + MainloopArguments const& mainloop_args, + TensorStorage& shared_tensors, + PipelineLoadMmaQ& pipeline_load_mma_q, + typename PipelineLoadMmaQ::PipelineState& pipeline_load_mma_q_consumer_state, + PipelineLoadMmaDO& pipeline_load_mma_do, + typename PipelineLoadMmaDO::PipelineState& pipeline_load_mma_do_consumer_state, + PipelineMmaComputeS& pipeline_mma_compute_s, + typename PipelineMmaComputeS::PipelineState& pipeline_mma_compute_s_producer_state, + PipelineMmaComputeDP& pipeline_mma_compute_dp, + typename PipelineMmaComputeDP::PipelineState& pipeline_mma_compute_dp_producer_state, + PipelineMmaReduceDQ& pipeline_mma_reduce_dq, + typename PipelineMmaReduceDQ::PipelineState& pipeline_mma_reduce_dq_producer_state, + PipelineComputeMmaP& pipeline_compute_mma_p, + typename PipelineComputeMmaP::PipelineState& pipeline_compute_mma_p_consumer_state, + PipelineComputeMmaDS& pipeline_compute_mma_ds, + typename PipelineComputeMmaDS::PipelineState& pipeline_compute_mma_ds_consumer_state, + PipelineMmaComputeDKDV& pipeline_mma_compute_dkdv, + typename 
PipelineMmaComputeDKDV::PipelineState& pipeline_mma_compute_dkdv_producer_state) { + + auto [Q, K, D, HB] = problem_shape; + + auto sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{}); + auto sK = make_tensor(make_smem_ptr(shared_tensors.smem_k.begin()), SmemLayoutK{}); + auto sV = make_tensor(make_smem_ptr(shared_tensors.smem_v.begin()), SmemLayoutV{}); + auto sDO = make_tensor(make_smem_ptr(shared_tensors.smem_do.begin()), SmemLayoutDO{}); + + auto sQT = make_tensor(make_smem_ptr(shared_tensors.smem_q_t.begin()), SmemLayoutQT{}); + auto sKT = make_tensor(make_smem_ptr(shared_tensors.smem_k_t.begin()), SmemLayoutKT{}); + auto sDS = make_tensor(make_smem_ptr(shared_tensors.smem_ds.begin()), SmemLayoutDS{}); + auto sDST = make_tensor(make_smem_ptr(shared_tensors.smem_ds_t.begin()), SmemLayoutDST{}); + auto sP = make_tensor(make_smem_ptr((Element*) nullptr), typename CollectiveMmaPDO::SmemLayoutA{}); + auto sDOT = make_tensor(make_smem_ptr(shared_tensors.smem_do_t.begin()), SmemLayoutDOT{}); + + Tensor tSTrK = TiledMmaKQ::make_fragment_A(sK); + Tensor tSTrQ = TiledMmaKQ::make_fragment_B(sQ); + + Tensor tDPTrV = TiledMmaVDO::make_fragment_A(sV); + Tensor tDPTrDO = TiledMmaVDO::make_fragment_B(sDO); + + Tensor tDQrDS = TiledMmaDSK::make_fragment_A(sDS); + Tensor tDQrKT = TiledMmaDSK::make_fragment_B(sKT); + + Tensor tDKrDST = TiledMmaDSQ::make_fragment_A(sDST); + Tensor tDKrQT = TiledMmaDSQ::make_fragment_B(sQT); + + Tensor tDVrP = TiledMmaPDO::make_fragment_A(sP)(_, _, _, _0{}); + tDVrP.data() = TmemAllocation::kP; + Tensor tDVrDOT = TiledMmaPDO::make_fragment_B(sDOT); + + TiledMmaKQ tiled_mma_kq; + TiledMmaVDO tiled_mma_vdo; + TiledMmaDSK tiled_mma_dsk; + TiledMmaDSQ tiled_mma_dsq; + TiledMmaPDO tiled_mma_pdo; + + tiled_mma_dsq.accumulate_ = UMMA::ScaleOut::Zero; + tiled_mma_pdo.accumulate_ = UMMA::ScaleOut::Zero; + + Tensor tSTtST = partition_fragment_C(tiled_mma_kq, select<0,1>(TileShapeKQ{})); + tSTtST.data() = TmemAllocation::kS; + + 
Tensor tDPTtDPT = partition_fragment_C(tiled_mma_vdo, select<0,1>(TileShapeVDO{})); + tDPTtDPT.data() = TmemAllocation::kDP; + + Tensor tDQtDQ = partition_fragment_C(tiled_mma_dsk, select<0,1>(TileShapeDSK{})); + tDQtDQ.data() = TmemAllocation::kDQ; + + Tensor tDKtDK = partition_fragment_C(tiled_mma_dsq, select<0,1>(TileShapeDSQ{})); + tDKtDK.data() = TmemAllocation::kDK; + + Tensor tDVtDV = partition_fragment_C(tiled_mma_pdo, select<0,1>(TileShapePDO{})); + tDVtDV.data() = TmemAllocation::kDV; + + auto pipeline_load_mma_q_release_state = pipeline_load_mma_q_consumer_state; + + pipeline_load_mma_q.consumer_wait(pipeline_load_mma_q_consumer_state); + pipeline_mma_compute_s.producer_acquire(pipeline_mma_compute_s_producer_state); + + // S = Q*K + tiled_mma_kq.accumulate_ = UMMA::ScaleOut::Zero; + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tSTrQ); ++k_block) { + cute::gemm(tiled_mma_kq, + tSTrK(_,_,k_block,_0{}), + tSTrQ(_,_,k_block,pipeline_load_mma_q_consumer_state.index()), + tSTtST); + tiled_mma_kq.accumulate_ = UMMA::ScaleOut::One; + } + + ++pipeline_load_mma_q_consumer_state; + + pipeline_mma_compute_s.producer_commit(pipeline_mma_compute_s_producer_state); + ++pipeline_mma_compute_s_producer_state; + + pipeline_load_mma_do.consumer_wait(pipeline_load_mma_do_consumer_state); + + pipeline_mma_compute_dp.producer_acquire(pipeline_mma_compute_dp_producer_state); + pipeline_mma_reduce_dq.producer_acquire(pipeline_mma_reduce_dq_producer_state); + + // dP = dO*V + tiled_mma_vdo.accumulate_ = UMMA::ScaleOut::Zero; + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDPTrV); ++k_block) { + cute::gemm(tiled_mma_vdo, + tDPTrV(_,_,k_block,_0{}), + tDPTrDO(_,_,k_block,pipeline_load_mma_do_consumer_state.index()), + tDPTtDPT); + tiled_mma_vdo.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_mma_compute_dp.producer_commit(pipeline_mma_compute_dp_producer_state); + ++pipeline_mma_compute_dp_producer_state; + + 
pipeline_compute_mma_p.consumer_wait(pipeline_compute_mma_p_consumer_state); + + // dV = P*dO + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDVrP); ++k_block) { + cute::gemm(tiled_mma_pdo, + tDVrP(_,_,k_block), + tDVrDOT(_,_,k_block,pipeline_load_mma_do_consumer_state.index()), + tDVtDV); + tiled_mma_pdo.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_compute_mma_p.consumer_release(pipeline_compute_mma_p_consumer_state); + ++pipeline_compute_mma_p_consumer_state; + + pipeline_load_mma_do.consumer_release(pipeline_load_mma_do_consumer_state); + ++pipeline_load_mma_do_consumer_state; + + iter_count -= 1; + + // in tmem, S & P overlap + // and dP and dQ overlap + // so we need to acquire dQ and dP at the same time + while (iter_count > 0) { + pipeline_load_mma_q.consumer_wait(pipeline_load_mma_q_consumer_state); + pipeline_mma_compute_s.producer_acquire(pipeline_mma_compute_s_producer_state); + + // S = Q*K + tiled_mma_kq.accumulate_ = UMMA::ScaleOut::Zero; + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tSTrQ); ++k_block) { + cute::gemm(tiled_mma_kq, + tSTrK(_,_,k_block,_0{}), + tSTrQ(_,_,k_block,pipeline_load_mma_q_consumer_state.index()), + tSTtST); + tiled_mma_kq.accumulate_ = UMMA::ScaleOut::One; + } + + ++pipeline_load_mma_q_consumer_state; + + pipeline_mma_compute_s.producer_commit(pipeline_mma_compute_s_producer_state); + ++pipeline_mma_compute_s_producer_state; + + pipeline_compute_mma_ds.consumer_wait(pipeline_compute_mma_ds_consumer_state); + + // we need to acquire dP here, because tmem dQ == tmem dP + pipeline_mma_compute_dp.producer_acquire(pipeline_mma_compute_dp_producer_state); + + // dQ = dS*K + tiled_mma_dsk.accumulate_ = UMMA::ScaleOut::Zero; + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDQrDS); ++k_block) { + cute::gemm(tiled_mma_dsk, + tDQrDS(_,_,k_block,pipeline_compute_mma_ds_consumer_state.index()), + tDQrKT(_,_,k_block,_0{}), + tDQtDQ); + tiled_mma_dsk.accumulate_ = 
UMMA::ScaleOut::One; + } + + pipeline_mma_reduce_dq.producer_commit(pipeline_mma_reduce_dq_producer_state); + ++pipeline_mma_reduce_dq_producer_state; + + // dK = dS*Q + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDKrDST); ++k_block) { + cute::gemm(tiled_mma_dsq, + tDKrDST(_,_,k_block,pipeline_compute_mma_ds_consumer_state.index()), + tDKrQT(_,_,k_block,pipeline_load_mma_q_release_state.index()), + tDKtDK); + tiled_mma_dsq.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_load_mma_q.consumer_release(pipeline_load_mma_q_release_state); + ++pipeline_load_mma_q_release_state; + + pipeline_compute_mma_ds.consumer_release(pipeline_compute_mma_ds_consumer_state); + ++pipeline_compute_mma_ds_consumer_state; + + // we grab dq here, because in tmem dq == dp + pipeline_mma_reduce_dq.producer_acquire(pipeline_mma_reduce_dq_producer_state); + + pipeline_load_mma_do.consumer_wait(pipeline_load_mma_do_consumer_state); + + // dP = dO*V + tiled_mma_vdo.accumulate_ = UMMA::ScaleOut::Zero; + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDPTrV); ++k_block) { + cute::gemm(tiled_mma_vdo, + tDPTrV(_,_,k_block,_0{}), + tDPTrDO(_,_,k_block,pipeline_load_mma_do_consumer_state.index()), + tDPTtDPT); + tiled_mma_vdo.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_mma_compute_dp.producer_commit(pipeline_mma_compute_dp_producer_state); + ++pipeline_mma_compute_dp_producer_state; + + pipeline_compute_mma_p.consumer_wait(pipeline_compute_mma_p_consumer_state); + + // dV = P*dO + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDVrP); ++k_block) { + cute::gemm(tiled_mma_pdo, + tDVrP(_,_,k_block), + tDVrDOT(_,_,k_block,pipeline_load_mma_do_consumer_state.index()), + tDVtDV); + tiled_mma_pdo.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_compute_mma_p.consumer_release(pipeline_compute_mma_p_consumer_state); + ++pipeline_compute_mma_p_consumer_state; + + pipeline_load_mma_do.consumer_release(pipeline_load_mma_do_consumer_state); + 
++pipeline_load_mma_do_consumer_state; + + iter_count -= 1; + } + + // signal to the epilogue that dV is ready + pipeline_mma_compute_dkdv.producer_acquire(pipeline_mma_compute_dkdv_producer_state); + pipeline_mma_compute_dkdv.producer_commit(pipeline_mma_compute_dkdv_producer_state); + ++pipeline_mma_compute_dkdv_producer_state; + + pipeline_mma_compute_dkdv.producer_acquire(pipeline_mma_compute_dkdv_producer_state); + + pipeline_compute_mma_ds.consumer_wait(pipeline_compute_mma_ds_consumer_state); + + // dK = dS*Q + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDKrDST); ++k_block) { + cute::gemm(tiled_mma_dsq, + tDKrDST(_,_,k_block,pipeline_compute_mma_ds_consumer_state.index()), + tDKrQT(_,_,k_block,pipeline_load_mma_q_release_state.index()), + tDKtDK); + tiled_mma_dsq.accumulate_ = UMMA::ScaleOut::One; + } + + // signal to epilgue that dK is ready + pipeline_mma_compute_dkdv.producer_commit(pipeline_mma_compute_dkdv_producer_state); + ++pipeline_mma_compute_dkdv_producer_state; + + // we've already acquired mma_reduce_dq in the loop + + // dQ = dS*K + tiled_mma_dsk.accumulate_ = UMMA::ScaleOut::Zero; + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tDQrDS); ++k_block) { + cute::gemm(tiled_mma_dsk, + tDQrDS(_,_,k_block,pipeline_compute_mma_ds_consumer_state.index()), + tDQrKT(_,_,k_block,_0{}), + tDQtDQ); + tiled_mma_dsk.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_mma_reduce_dq.producer_commit(pipeline_mma_reduce_dq_producer_state); + ++pipeline_mma_reduce_dq_producer_state; + + pipeline_load_mma_q.consumer_release(pipeline_load_mma_q_release_state); + ++pipeline_load_mma_q_release_state; + + pipeline_compute_mma_ds.consumer_release(pipeline_compute_mma_ds_consumer_state); + ++pipeline_compute_mma_ds_consumer_state; + } + + + + template + CUTLASS_DEVICE void store( + TensorG gmem, + TensorR const& regs, + TensorC const& coord, + TensorShape const& tensor_shape) { + + auto copy_op = make_cotiled_copy( + Copy_Atom, 
Element>{}, + make_layout(make_shape(_1{}, Int{})), + regs.layout() + ); + auto thr_copy = copy_op.get_slice(_0{}); + + auto tCg = thr_copy.partition_D(gmem); + auto tCr = thr_copy.partition_S(quantize(regs)); + auto tCc = thr_copy.partition_D(coord); + + constexpr int R = decltype(tCr.layout())::rank; + auto tCg_v = group_modes<1, R>(tCg); + auto tCr_v = group_modes<1, R>(tCr); + auto tCc_v = group_modes<1, R>(tCc); + auto tCp_v = make_tensor(shape<1>(tCc_v)); + + for (int i = 0; i < size(tCp_v); ++i) { + tCp_v(i) = elem_less(tCc_v(_0{},i), tensor_shape); + } + + copy_if(copy_op, tCp_v, tCr_v, tCg_v); + } + + + template + CUTLASS_DEVICE void epilogue( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + EpilogueArguments const& epilogue_args, + PipelineMmaComputeDKDV& pipeline_mma_compute_dkdv, + typename PipelineMmaComputeDKDV::PipelineState& pipeline_mma_compute_dkdv_consumer_state) { + + auto [Q, K, D, HB] = problem_shape; + auto [blk_coord_q, blk_coord_k, blk_coord_batch] = blk_coord; + + auto load_op = SM100_TMEM_LOAD_32dp32b16x{}; + + auto tDKtDK = partition_fragment_C(TiledMmaDSQ{}, select<0,1>(TileShapeDSQ{}))(make_coord(_,_),_0{},_0{}); + tDKtDK.data() = TmemAllocation::kDK; + + auto mDK = make_tensor(make_gmem_ptr(epilogue_args.ptr_dk), make_shape(K, TileShapeDQK{}, HB), epilogue_args.stride_dk); + auto gDK = local_tile(mDK, TileShapeDSQ{}, make_coord(_,_,_), Step<_1, _1, X>{}) + (_, _, blk_coord_k, _0{}, blk_coord_batch); + + Tensor cDK = domain_offset( + make_coord(get<1>(blk_coord) * TileShapeK{}, _0{}), + make_identity_tensor(take<0,2>(TileShapeDSQ{})) + ); + + constexpr int kNumWarpgroups = kNumComputeWarps / 4; + int dp_idx = threadIdx.x % 128; + int wg_idx = (threadIdx.x % (kNumComputeWarps * NumThreadsPerWarp)) / 128; + + auto split_wg = [&](auto const& t) { + if constexpr (decltype(rank(t))::value == 3) { + auto p = t.compose(make_layout(make_shape(size<0>(t), size<1>(t), make_shape(Int{}, 
size<2>(t) / Int{})))); + return p(_, _, make_coord(wg_idx, _)); + } + else { + auto p = t.compose(make_layout(make_shape(size<0>(t), size<1>(t), size<2>(t), make_shape(Int{}, size<3>(t) / Int{})))); + return p(_, _, _, make_coord(wg_idx, _)); + } + }; + + auto tiled_t2r_dk = make_tmem_copy(load_op, tDKtDK); + auto thread_t2r_dk = tiled_t2r_dk.get_slice(dp_idx); + + Tensor tTR_cDK = split_wg(thread_t2r_dk.partition_D(cDK)); + Tensor tTR_gDK = split_wg(thread_t2r_dk.partition_D(gDK)); + Tensor tTR_rDK = make_tensor(shape(tTR_cDK)); + Tensor tTR_tDK = split_wg(thread_t2r_dk.partition_S(tDKtDK)); + + auto tDVtDV = partition_fragment_C(TiledMmaDSQ{}, select<0,1>(TileShapeDSQ{}))(make_coord(_,_),_0{},_0{}); + tDVtDV.data() = TmemAllocation::kDV; + + auto mDV = make_tensor(make_gmem_ptr(epilogue_args.ptr_dv), make_shape(K, TileShapeDVO{}, HB), epilogue_args.stride_dv); + auto gDV = local_tile(mDV, TileShapePDO{}, make_coord(_,_,_), Step<_1, _1, X>{}) + (_, _, blk_coord_k, _0{}, blk_coord_batch); + + Tensor cDV = domain_offset( + make_coord(get<1>(blk_coord) * TileShapeK{}, _0{}), + make_identity_tensor(take<0,2>(TileShapePDO{})) + ); + + auto tiled_t2r_dv = make_tmem_copy(load_op, tDVtDV); + auto thread_t2r_dv = tiled_t2r_dv.get_slice(dp_idx); + + Tensor tTR_cDV = split_wg(thread_t2r_dv.partition_D(cDV)); + Tensor tTR_gDV = split_wg(thread_t2r_dv.partition_D(gDV)); + Tensor tTR_rDV = make_tensor(shape(tTR_cDV)); + Tensor tTR_tDV = split_wg(thread_t2r_dv.partition_S(tDVtDV)); + + pipeline_mma_compute_dkdv.consumer_wait(pipeline_mma_compute_dkdv_consumer_state); + + // load tDVtDV + cute::copy(tiled_t2r_dv, tTR_tDV, tTR_rDV); + + // store tDVgDV + store(tTR_gDV, tTR_rDV, tTR_cDV, select<1,2>(problem_shape)); + + cutlass::arch::fence_view_async_tmem_load(); + pipeline_mma_compute_dkdv.consumer_release(pipeline_mma_compute_dkdv_consumer_state); + ++pipeline_mma_compute_dkdv_consumer_state; + + pipeline_mma_compute_dkdv.consumer_wait(pipeline_mma_compute_dkdv_consumer_state); 
+ + // load tDKtDK + cute::copy(tiled_t2r_dk, tTR_tDK, tTR_rDK); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rDK); i++) { + tTR_rDK(i) = mainloop_args.softmax_scale * tTR_rDK(i); + } + + // store tDKgDK + store(tTR_gDK, tTR_rDK, tTR_cDK, select<1,2>(problem_shape)); + + cutlass::arch::fence_view_async_tmem_load(); + pipeline_mma_compute_dkdv.consumer_release(pipeline_mma_compute_dkdv_consumer_state); + ++pipeline_mma_compute_dkdv_consumer_state; + + } + + + template + CUTLASS_DEVICE void compute( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + int iter_index, + int iter_count, + MainloopArguments const& mainloop_args, + EpilogueArguments const& epilogue_args, + TensorStorage& shared_tensors, + PipelineLoadComputeLSE& pipeline_load_compute_lse, + typename PipelineLoadComputeLSE::PipelineState& pipeline_load_compute_lse_consumer_state, + PipelineLoadComputeSumOdO& pipeline_load_compute_sum_odo, + typename PipelineLoadComputeSumOdO::PipelineState& pipeline_load_compute_sum_odo_consumer_state, + PipelineMmaComputeS& pipeline_mma_compute_s, + typename PipelineMmaComputeS::PipelineState& pipeline_mma_compute_s_consumer_state, + PipelineMmaComputeDP& pipeline_mma_compute_dp, + typename PipelineMmaComputeDP::PipelineState& pipeline_mma_compute_dp_consumer_state, + PipelineComputeMmaP& pipeline_compute_mma_p, + typename PipelineComputeMmaP::PipelineState& pipeline_compute_mma_p_producer_state, + PipelineComputeMmaDS& pipeline_compute_mma_ds, + typename PipelineComputeMmaDS::PipelineState& pipeline_compute_mma_ds_producer_state, + PipelineMmaComputeDKDV& pipeline_mma_compute_dkdv, + typename PipelineMmaComputeDKDV::PipelineState& pipeline_mma_compute_dkdv_consumer_state) { + + + auto [Q, K, D, HB] = problem_shape; + + // in tmem, S & P overlap + // and dP and dQ overlap + + // there are two compute wg's that cooperatively compute softmax + // they are striped by this tmem atom, i.e. 
wg0 has 16 elems, then wg1 etc + + auto load_op = SM100_TMEM_LOAD_32dp32b16x{}; + auto store_op = SM100_TMEM_STORE_32dp32b8x{}; + + Tensor tSTtST = partition_fragment_C(TiledMmaKQ{}, select<0,1>(TileShapeKQ{}))(make_coord(_,_),_0{},_0{}); + tSTtST.data() = TmemAllocation::kS; + + Tensor tDPTtDPT = partition_fragment_C(TiledMmaVDO{}, select<0,1>(TileShapeVDO{}))(make_coord(_,_),_0{},_0{}); + tDPTtDPT.data() = TmemAllocation::kDP; + + Tensor cST = make_identity_tensor(take<0,2>(TileShapeKQ{})); + Tensor cDPT = make_identity_tensor(take<0,2>(TileShapeVDO{})); + + constexpr int kNumWarpgroups = kNumComputeWarps / 4; + int dp_idx = threadIdx.x % 128; + int wg_idx = (threadIdx.x % (kNumComputeWarps * NumThreadsPerWarp)) / 128; + auto tiled_t2r = make_tmem_copy(load_op, tSTtST); + auto thread_t2r = tiled_t2r.get_slice(dp_idx); + + auto split_wg = [&](auto const& t) { + if constexpr (decltype(rank(t))::value == 3) { + auto p = t.compose(make_layout(make_shape(size<0>(t), size<1>(t), make_shape(Int{}, size<2>(t) / Int{})))); + return p(_, _, make_coord(wg_idx, _)); + } + else { + auto p = t.compose(make_layout(make_shape(size<0>(t), size<1>(t), size<2>(t), make_shape(Int{}, size<3>(t) / Int{})))); + return p(_, _, _, make_coord(wg_idx, _)); + } + }; + + Tensor tTR_cST = split_wg(thread_t2r.partition_D(cST)); + Tensor tTR_rST = make_tensor(shape(tTR_cST)); + Tensor tTR_tST = split_wg(thread_t2r.partition_S(tSTtST)); + + Tensor tTR_cDPT_p = thread_t2r.partition_D(cDPT); + Tensor tTR_cDPT = split_wg(tTR_cDPT_p); + Tensor tTR_rDPT = make_tensor(shape(tTR_cDPT)); + Tensor tTR_tDPT = split_wg(thread_t2r.partition_S(tDPTtDPT)); + + Tensor sLSE = make_tensor(make_smem_ptr(shared_tensors.smem_lse.begin()), SmemLayoutLSE{}); + Tensor sSumOdO = make_tensor(make_smem_ptr(shared_tensors.smem_sum_odo.begin()), SmemLayoutSumOdO{}); + + auto sP = make_tensor(make_smem_ptr((Element*) nullptr), typename CollectiveMmaPDO::SmemLayoutA{}); + + + auto tDVrP = TiledMmaPDO::make_fragment_A(sP)(_, 
_, _, _0{}); + auto tDVcST = TiledMmaPDO{}.get_slice(_0{}).partition_A(cST); + tDVrP.data() = TmemAllocation::kP; + + auto tiled_r2t = make_tmem_copy(store_op, tDVrP); + auto thread_r2t = tiled_r2t.get_slice(dp_idx); + + auto tRT_tP = split_wg(thread_r2t.partition_D(tDVrP)); + auto tRT_cST = split_wg(thread_r2t.partition_S(tDVcST)); + + CUTLASS_PRAGMA_NO_UNROLL + while (iter_count > 0) { + // wait for S and P + pipeline_mma_compute_s.consumer_wait(pipeline_mma_compute_s_consumer_state); + pipeline_compute_mma_p.producer_acquire(pipeline_compute_mma_p_producer_state); + // wait for LSE + pipeline_load_compute_lse.consumer_wait(pipeline_load_compute_lse_consumer_state); + + auto dispatch_bool = [](bool b, auto fn) { + if (b) { + fn(cute::true_type{}); + } + else { + fn(cute::false_type{}); + } + }; + + dispatch_bool(std::is_base_of_v && + warp_uniform(iter_index == get<1>(blk_coord)), [&](auto is_causal_masked_tile) { + + // compute P = softmax(S, LSE) + cute::copy(tiled_t2r, tTR_tST, tTR_rST); + + if constexpr (std::is_base_of_v && decltype(is_causal_masked_tile)::value) { + Mask{}.apply_mask(tTR_rST, [&](int i) { + auto c_transpose = tTR_cST(i); + return make_coord(get<1>(c_transpose) + iter_index * TileShapeQ{}, get<0>(c_transpose) + get<1>(blk_coord) * TileShapeK{}); + }, problem_shape); + } + + ElementAcc log2_e = static_cast(M_LOG2E); + float2 softmax_scale_log2_e; + softmax_scale_log2_e.x = mainloop_args.softmax_scale * log2_e; + softmax_scale_log2_e.y = mainloop_args.softmax_scale * log2_e; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rST); i += 2) { + float2 acc; + float2 lse; + float2 out; + acc.x = tTR_rST(i); + acc.y = tTR_rST(i + 1); + lse.x = sLSE(get<1>(tTR_cST(i)), pipeline_load_compute_lse_consumer_state.index()); + lse.y = sLSE(get<1>(tTR_cST(i+1)), pipeline_load_compute_lse_consumer_state.index()); + cute::fma(out, softmax_scale_log2_e, acc, lse); + tTR_rST(i) = ::exp2f(out.x); + tTR_rST(i+1) = ::exp2f(out.y); + } + + auto tRT_rST = 
quantize(tTR_rST); + auto tRT_rST_reshaped = make_tensor(tRT_rST.data(), shape(tRT_cST)); + + cutlass::arch::fence_view_async_tmem_load(); + cutlass::arch::NamedBarrier( + kNumComputeWarps * NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::TransformBarrier + ).arrive_and_wait(); + + cute::copy(tiled_r2t, tRT_rST_reshaped, tRT_tP); + }); + + // notify for P + cutlass::arch::fence_view_async_tmem_store(); + pipeline_compute_mma_p.producer_commit(pipeline_compute_mma_p_producer_state); + ++pipeline_compute_mma_p_producer_state; + // release S + pipeline_mma_compute_s.consumer_release(pipeline_mma_compute_s_consumer_state); + ++pipeline_mma_compute_s_consumer_state; + // release LSE + pipeline_load_compute_lse.consumer_release(pipeline_load_compute_lse_consumer_state); + ++pipeline_load_compute_lse_consumer_state; + + // wait for OdO + pipeline_load_compute_sum_odo.consumer_wait(pipeline_load_compute_sum_odo_consumer_state); + // wait for dP + pipeline_mma_compute_dp.consumer_wait(pipeline_mma_compute_dp_consumer_state); + + // wait for dS + // in principle, we could defer waiting for dS, and move in the freeing of dP + // however, that would force us to keep dS in registers longer + pipeline_compute_mma_ds.producer_acquire(pipeline_compute_mma_ds_producer_state); + + // compute dS = dsoftmax(P, dP, sum_OdO) + cute::copy(tiled_t2r, tTR_tDPT, tTR_rDPT); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rDPT); i += 2) { + float2 st; + st.x = tTR_rST(i); + st.y = tTR_rST(i+1); + float2 dpt; + dpt.x = tTR_rDPT(i); + dpt.y = tTR_rDPT(i+1); + float2 odo; + odo.x = sSumOdO(get<1>(tTR_cDPT(i)), pipeline_load_compute_sum_odo_consumer_state.index()); + odo.y = sSumOdO(get<1>(tTR_cDPT(i+1)), pipeline_load_compute_sum_odo_consumer_state.index()); + float2 dif; + // sum odo is negated during preprocess + cute::add(dif, dpt, odo); + float2 out; + cute::mul(out, dif, st); + tTR_rDPT(i) = out.x; + tTR_rDPT(i+1) = out.y; + } + + auto tTR_rDST = quantize(tTR_rDPT); + + 
// release dP + cutlass::arch::fence_view_async_tmem_load(); + pipeline_mma_compute_dp.consumer_release(pipeline_mma_compute_dp_consumer_state); + ++pipeline_mma_compute_dp_consumer_state; + + Tensor sDS = make_tensor(make_smem_ptr((Element*) shared_tensors.smem_ds.begin()), SmemLayoutDS{}) + (_, _, _, pipeline_compute_mma_ds_producer_state.index()); + + auto thread_layout = make_ordered_layout( + make_shape(_128{}, _128{}), + make_stride(_1{}, _0{}) + ); + + auto sDS_pi = as_position_independent_swizzle_tensor(sDS); + auto sDS_pi_slice_p = sDS_pi.compose(thread_layout)(dp_idx, _).compose(make_layout(shape(tTR_cDPT_p))); + auto sDS_pi_slice = split_wg(sDS_pi_slice_p); + + copy_aligned(tTR_rDST, sDS_pi_slice); + + // notify for dS + cutlass::arch::fence_view_async_shared(); + pipeline_compute_mma_ds.producer_commit(pipeline_compute_mma_ds_producer_state); + ++pipeline_compute_mma_ds_producer_state; + // release OdO + pipeline_load_compute_sum_odo.consumer_release(pipeline_load_compute_sum_odo_consumer_state); + ++pipeline_load_compute_sum_odo_consumer_state; + + iter_count -= 1; + iter_index += 1; + } + + epilogue( + blk_coord, problem_shape, mainloop_args, epilogue_args, + pipeline_mma_compute_dkdv, pipeline_mma_compute_dkdv_consumer_state + ); + } + + template + CUTLASS_DEVICE void reduce( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + int iter_index, + int iter_count, + MainloopArguments const& mainloop_args, + MainloopParams const& mainloop_params, + TensorStorage& shared_tensors, + PipelineMmaReduceDQ& pipeline_mma_reduce_dq, + typename PipelineMmaReduceDQ::PipelineState& pipeline_mma_reduce_dq_consumer_state, + PipelineReduceTmaStore& pipeline_reduce_tma_store, + typename PipelineReduceTmaStore::PipelineState& pipeline_reduce_tma_store_producer_state) { + + using X = Underscore; + + auto [Q, K, D, HB] = problem_shape; + + auto [blk_coord_q, blk_coord_k, blk_coord_batch] = blk_coord; + + // must match TileShapeDQ + auto load_op = 
SM100_TMEM_LOAD_32dp32b32x{}; + + auto tDQtDQ = partition_fragment_C(TiledMmaDSK{}, select<0,1>(TileShapeDSK{}))(make_coord(_,_),_0{},_0{}); + tDQtDQ.data() = TmemAllocation::kDQ; + + Tensor mDQ = mainloop_params.tma_red_dq.get_tma_tensor(make_shape(Q, D, HB)); + auto gDQ = local_tile(mDQ, TileShapeKQ{}, make_coord(_,_,_), Step<_1, _1, X>{}) + (_, _, _, _0{}, blk_coord_batch); + + Tensor cDQ = make_identity_tensor(take<0,2>(TileShapeDSK{})); + + Tensor sDQ = make_tensor(make_smem_ptr(shared_tensors.smem_dq.begin()), SmemLayoutDQ{}); + + int thread_idx = threadIdx.x % (kNumComputeWarps * NumThreadsPerWarp); + auto tiled_t2r = make_tmem_copy(load_op, tDQtDQ); + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + + Tensor tTR_cDQ = thread_t2r.partition_D(cDQ); + Tensor tTR_gDQ = thread_t2r.partition_D(gDQ); + Tensor tTR_sDQ = thread_t2r.partition_D(sDQ); + Tensor tTR_tDQ = thread_t2r.partition_S(tDQtDQ); + + auto block_tma = mainloop_params.tma_red_dq.get_slice(_0{}); + + Tensor tDQsDQ = block_tma.partition_S(sDQ); + Tensor tDQcDQ = block_tma.partition_S(cDQ); + Tensor tDQgDQ = block_tma.partition_D(gDQ); + + int lane_predicate = (threadIdx.x % (kNumReduceWarps * NumThreadsPerWarp)) == 0; + + while (iter_count > 0) { + pipeline_mma_reduce_dq.consumer_wait(pipeline_mma_reduce_dq_consumer_state); + + Tensor tTR_rDQ = make_tensor(shape(tTR_cDQ)); + + // load dQ from tmem to rmem + cute::copy(tiled_t2r, tTR_tDQ, tTR_rDQ); + + cutlass::arch::fence_view_async_tmem_load(); + pipeline_mma_reduce_dq.consumer_release(pipeline_mma_reduce_dq_consumer_state); + ++pipeline_mma_reduce_dq_consumer_state; + + // we don't have enough smem to dump it all to smem, so we do it in stages + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<2>(tTR_cDQ); i++) { + if (lane_predicate) { + pipeline_reduce_tma_store.producer_acquire(pipeline_reduce_tma_store_producer_state); + } + // wait in all threads for the acquire to complete + cutlass::arch::NamedBarrier( + kNumReduceWarps * 
NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::TransposeBarrier + ).arrive_and_wait(); + + cute::copy(tTR_rDQ(_, _, i), tTR_sDQ(_, _, _0{}, pipeline_reduce_tma_store_producer_state.index())); + + // wait for the stores to all be visible to the TMA + cutlass::arch::fence_view_async_shared(); + cutlass::arch::NamedBarrier( + kNumReduceWarps * NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::TransposeBarrier + ).arrive_and_wait(); + if (lane_predicate) { + // launch tma store + copy(mainloop_params.tma_red_dq, tDQsDQ(_,_,_0{}, pipeline_reduce_tma_store_producer_state.index()), tDQgDQ(_,_,i,iter_index)); + pipeline_reduce_tma_store.producer_commit(pipeline_reduce_tma_store_producer_state); + } + + ++pipeline_reduce_tma_store_producer_state; + } + + iter_count -= 1; + iter_index += 1; + } + } + + + CUTLASS_DEVICE void operator()(Params const& params, char* smem) { + int warp_idx = cutlass::canonical_warp_idx_sync(); + auto role = warp_idx_to_role(warp_idx); + uint32_t lane_predicate = cute::elect_one_sync(); + + if (role == WarpRole::Load && lane_predicate) { + prefetch_tma_descriptor(params.mainloop_params.tma_load_q.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_k.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_v.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_do.get_tma_descriptor()); + } + + SharedStorage& shared_storage = *reinterpret_cast(smem); + + int initializing_warp = 0; + typename PipelineLoadMmaQ::Params pipeline_load_mma_q_params; + if (role == WarpRole::Load) { + pipeline_load_mma_q_params.role = PipelineLoadMmaQ::ThreadCategory::Producer; + } + if (role == WarpRole::Mma) { + pipeline_load_mma_q_params.role = PipelineLoadMmaQ::ThreadCategory::Consumer; + } + pipeline_load_mma_q_params.is_leader = lane_predicate && (role == WarpRole::Load); + // Also loads K in the first iteration + pipeline_load_mma_q_params.transaction_bytes = 
kTransactionsBytesLoadQ; + pipeline_load_mma_q_params.initializing_warp = initializing_warp++; + PipelineLoadMmaQ pipeline_load_mma_q(shared_storage.pipelines.load_mma_q, pipeline_load_mma_q_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineLoadMmaDO::Params pipeline_load_mma_do_params; + if (role == WarpRole::Load) { + pipeline_load_mma_do_params.role = PipelineLoadMmaDO::ThreadCategory::Producer; + } + if (role == WarpRole::Mma) { + pipeline_load_mma_do_params.role = PipelineLoadMmaDO::ThreadCategory::Consumer; + } + pipeline_load_mma_do_params.is_leader = lane_predicate && (role == WarpRole::Load); + // Also loads V in the first iteration + pipeline_load_mma_do_params.transaction_bytes = kTransactionsBytesLoadDO; + pipeline_load_mma_do_params.initializing_warp = initializing_warp++; + PipelineLoadMmaDO pipeline_load_mma_do(shared_storage.pipelines.load_mma_do, pipeline_load_mma_do_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineLoadComputeLSE::Params pipeline_load_compute_lse_params; + if (role == WarpRole::Load) { + pipeline_load_compute_lse_params.role = PipelineLoadComputeLSE::ThreadCategory::Producer; + } + if (role == WarpRole::Compute) { + pipeline_load_compute_lse_params.role = PipelineLoadComputeLSE::ThreadCategory::Consumer; + } + pipeline_load_compute_lse_params.producer_arv_count = NumThreadsPerWarp; + pipeline_load_compute_lse_params.consumer_arv_count = kNumComputeWarps * NumThreadsPerWarp; + pipeline_load_compute_lse_params.initializing_warp = initializing_warp++; + PipelineLoadComputeLSE pipeline_load_compute_lse( + shared_storage.pipelines.load_compute_lse, + pipeline_load_compute_lse_params, + /*barrier init*/ cute::true_type{}); + + typename PipelineLoadComputeSumOdO::Params pipeline_load_compute_sum_odo_params; + if (role == WarpRole::Load) { + pipeline_load_compute_sum_odo_params.role = 
PipelineLoadComputeSumOdO::ThreadCategory::Producer; + } + if (role == WarpRole::Compute) { + pipeline_load_compute_sum_odo_params.role = PipelineLoadComputeSumOdO::ThreadCategory::Consumer; + } + pipeline_load_compute_sum_odo_params.producer_arv_count = NumThreadsPerWarp; + pipeline_load_compute_sum_odo_params.consumer_arv_count = kNumComputeWarps * NumThreadsPerWarp; + pipeline_load_compute_sum_odo_params.initializing_warp = initializing_warp++; + PipelineLoadComputeSumOdO pipeline_load_compute_sum_odo( + shared_storage.pipelines.load_compute_sum_odo, + pipeline_load_compute_sum_odo_params, + /*barrier init*/ cute::true_type{}); + + typename PipelineMmaComputeS::Params pipeline_mma_compute_s_params; + if (role == WarpRole::Mma) { + pipeline_mma_compute_s_params.role = PipelineMmaComputeS::ThreadCategory::Producer; + } + if (role == WarpRole::Compute) { + pipeline_mma_compute_s_params.role = PipelineMmaComputeS::ThreadCategory::Consumer; + } + pipeline_mma_compute_s_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp; + pipeline_mma_compute_s_params.initializing_warp = initializing_warp++; + PipelineMmaComputeS pipeline_mma_compute_s( + shared_storage.pipelines.mma_compute_s, + pipeline_mma_compute_s_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineMmaComputeDP::Params pipeline_mma_compute_dp_params; + if (role == WarpRole::Mma) { + pipeline_mma_compute_dp_params.role = PipelineMmaComputeDP::ThreadCategory::Producer; + } + if (role == WarpRole::Compute) { + pipeline_mma_compute_dp_params.role = PipelineMmaComputeDP::ThreadCategory::Consumer; + } + pipeline_mma_compute_dp_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp; + pipeline_mma_compute_dp_params.initializing_warp = initializing_warp++; + PipelineMmaComputeDP pipeline_mma_compute_dp( + shared_storage.pipelines.mma_compute_dp, + pipeline_mma_compute_dp_params, + ClusterShape{}, /*barrier init*/ 
cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineMmaReduceDQ::Params pipeline_mma_reduce_dq_params; + if (role == WarpRole::Mma) { + pipeline_mma_reduce_dq_params.role = PipelineMmaReduceDQ::ThreadCategory::Producer; + } + if (role == WarpRole::Reduce) { + pipeline_mma_reduce_dq_params.role = PipelineMmaReduceDQ::ThreadCategory::Consumer; + } + pipeline_mma_reduce_dq_params.consumer_arv_count = kNumReduceWarps * cutlass::NumThreadsPerWarp; + pipeline_mma_reduce_dq_params.initializing_warp = initializing_warp++; + PipelineMmaReduceDQ pipeline_mma_reduce_dq( + shared_storage.pipelines.mma_reduce_dq, + pipeline_mma_reduce_dq_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineComputeMmaP::Params pipeline_compute_mma_p_params; + if (role == WarpRole::Mma) { + pipeline_compute_mma_p_params.role = PipelineComputeMmaP::ThreadCategory::Consumer; + } + if (role == WarpRole::Compute) { + pipeline_compute_mma_p_params.role = PipelineComputeMmaP::ThreadCategory::Producer; + } + pipeline_compute_mma_p_params.producer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp; + pipeline_compute_mma_p_params.consumer_arv_count = 1; + pipeline_compute_mma_p_params.initializing_warp = initializing_warp++; + PipelineComputeMmaP pipeline_compute_mma_p( + shared_storage.pipelines.compute_mma_p, + pipeline_compute_mma_p_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineComputeMmaDS::Params pipeline_compute_mma_ds_params; + if (role == WarpRole::Mma) { + pipeline_compute_mma_ds_params.role = PipelineComputeMmaDS::ThreadCategory::Consumer; + } + if (role == WarpRole::Compute) { + pipeline_compute_mma_ds_params.role = PipelineComputeMmaDS::ThreadCategory::Producer; + } + pipeline_compute_mma_ds_params.producer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp; + pipeline_compute_mma_ds_params.consumer_arv_count = 1; + 
pipeline_compute_mma_ds_params.initializing_warp = initializing_warp++; + PipelineComputeMmaDS pipeline_compute_mma_ds( + shared_storage.pipelines.compute_mma_ds, + pipeline_compute_mma_ds_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineMmaComputeDKDV::Params pipeline_mma_compute_dkdv_params; + if (role == WarpRole::Mma) { + pipeline_mma_compute_dkdv_params.role = PipelineMmaComputeDKDV::ThreadCategory::Producer; + } + if (role == WarpRole::Compute) { + pipeline_mma_compute_dkdv_params.role = PipelineMmaComputeDKDV::ThreadCategory::Consumer; + } + pipeline_mma_compute_dkdv_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp; + pipeline_mma_compute_dkdv_params.initializing_warp = initializing_warp++; + PipelineMmaComputeDKDV pipeline_mma_compute_dkdv( + shared_storage.pipelines.mma_compute_dkdv, + pipeline_mma_compute_dkdv_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + PipelineReduceTmaStore pipeline_reduce_tma_store; + + TmemAllocator tmem_allocator; + + pipeline_init_arrive_relaxed(size(ClusterShape{})); + + pipeline_load_mma_q.init_masks(ClusterShape{}); + pipeline_load_mma_do.init_masks(ClusterShape{}); + pipeline_mma_compute_s.init_masks(ClusterShape{}); + pipeline_mma_compute_dp.init_masks(ClusterShape{}); + pipeline_mma_reduce_dq.init_masks(ClusterShape{}); + pipeline_compute_mma_p.init_masks(ClusterShape{}); + pipeline_compute_mma_ds.init_masks(ClusterShape{}); + pipeline_mma_compute_dkdv.init_masks(ClusterShape{}); + + typename decltype(pipeline_load_mma_q)::PipelineState pipeline_load_mma_q_consumer_state; + typename decltype(pipeline_load_mma_do)::PipelineState pipeline_load_mma_do_consumer_state; + typename decltype(pipeline_load_compute_lse)::PipelineState pipeline_load_compute_lse_consumer_state; + typename decltype(pipeline_load_compute_sum_odo)::PipelineState pipeline_load_compute_sum_odo_consumer_state; + 
typename decltype(pipeline_mma_compute_s)::PipelineState pipeline_mma_compute_s_consumer_state; + typename decltype(pipeline_mma_compute_dp)::PipelineState pipeline_mma_compute_dp_consumer_state; + typename decltype(pipeline_mma_reduce_dq)::PipelineState pipeline_mma_reduce_dq_consumer_state; + typename decltype(pipeline_compute_mma_p)::PipelineState pipeline_compute_mma_p_consumer_state; + typename decltype(pipeline_compute_mma_ds)::PipelineState pipeline_compute_mma_ds_consumer_state; + typename decltype(pipeline_mma_compute_dkdv)::PipelineState pipeline_mma_compute_dkdv_consumer_state; + + auto pipeline_load_mma_q_producer_state = make_producer_start_state(); + auto pipeline_load_mma_do_producer_state = make_producer_start_state(); + auto pipeline_load_compute_lse_producer_state = make_producer_start_state(); + auto pipeline_load_compute_sum_odo_producer_state = make_producer_start_state(); + auto pipeline_mma_compute_s_producer_state = make_producer_start_state(); + auto pipeline_mma_compute_dp_producer_state = make_producer_start_state(); + auto pipeline_mma_reduce_dq_producer_state = make_producer_start_state(); + auto pipeline_compute_mma_p_producer_state = make_producer_start_state(); + auto pipeline_compute_mma_ds_producer_state = make_producer_start_state(); + auto pipeline_mma_compute_dkdv_producer_state = make_producer_start_state(); + auto pipeline_reduce_tma_store_producer_state = make_producer_start_state(); + + pipeline_init_wait(size(ClusterShape{})); + + auto blk_coord = make_coord(_0{}, blockIdx.x, make_coord(blockIdx.y, blockIdx.z)); + auto problem_shape = params.problem_shape; + int iter_count = ceil_div(get<0>(problem_shape), TileShapeQ{}); + int iter_start = 0; + if constexpr (std::is_base_of_v) { + iter_start = (get<1>(blk_coord) * TileShapeK{}) / TileShapeQ{}; + } + iter_count -= iter_start; + + if (role == WarpRole::Load) { + warpgroup_reg_set(); + + load( + blk_coord, + problem_shape, + iter_start, + iter_count, + params.mainloop, + 
params.mainloop_params, + shared_storage.tensors, + pipeline_load_mma_q, pipeline_load_mma_q_producer_state, + pipeline_load_mma_do, pipeline_load_mma_do_producer_state, + pipeline_load_compute_lse, pipeline_load_compute_lse_producer_state, + pipeline_load_compute_sum_odo, pipeline_load_compute_sum_odo_producer_state + ); + + } + else if (role == WarpRole::Mma) { + warpgroup_reg_set(); + + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + __syncwarp(); + + mma( + blk_coord, + problem_shape, + iter_start, + iter_count, + params.mainloop, + shared_storage.tensors, + pipeline_load_mma_q, pipeline_load_mma_q_consumer_state, + pipeline_load_mma_do, pipeline_load_mma_do_consumer_state, + pipeline_mma_compute_s, pipeline_mma_compute_s_producer_state, + pipeline_mma_compute_dp, pipeline_mma_compute_dp_producer_state, + pipeline_mma_reduce_dq, pipeline_mma_reduce_dq_producer_state, + pipeline_compute_mma_p, pipeline_compute_mma_p_consumer_state, + pipeline_compute_mma_ds, pipeline_compute_mma_ds_consumer_state, + pipeline_mma_compute_dkdv, pipeline_mma_compute_dkdv_producer_state + ); + + } + else if (role == WarpRole::Compute) { + warpgroup_reg_set(); + + compute( + blk_coord, + problem_shape, + iter_start, + iter_count, + params.mainloop, + params.epilogue, + shared_storage.tensors, + pipeline_load_compute_lse, pipeline_load_compute_lse_consumer_state, + pipeline_load_compute_sum_odo, pipeline_load_compute_sum_odo_consumer_state, + pipeline_mma_compute_s, pipeline_mma_compute_s_consumer_state, + pipeline_mma_compute_dp, pipeline_mma_compute_dp_consumer_state, + pipeline_compute_mma_p, pipeline_compute_mma_p_producer_state, + pipeline_compute_mma_ds, pipeline_compute_mma_ds_producer_state, + pipeline_mma_compute_dkdv, pipeline_mma_compute_dkdv_consumer_state + ); + + cutlass::arch::NamedBarrier( + kNumComputeWarps * NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier + ).arrive_and_wait(); + + if 
(warp_idx % kNumComputeWarps == 0) { + uint32_t free_stage_ptr = shared_storage.tmem_base_ptr; + tmem_allocator.free(free_stage_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } + + } + else if (role == WarpRole::Reduce) { + warpgroup_reg_set(); + + reduce( + blk_coord, + problem_shape, + iter_start, + iter_count, + params.mainloop, + params.mainloop_params, + shared_storage.tensors, + pipeline_mma_reduce_dq, pipeline_mma_reduce_dq_consumer_state, + pipeline_reduce_tma_store, pipeline_reduce_tma_store_producer_state + ); + + pipeline_reduce_tma_store.producer_tail(pipeline_reduce_tma_store_producer_state); + } + else { + warpgroup_reg_set(); + + /* no-op */ + + } + } + + static dim3 get_block_shape() { + dim3 block(MaxThreadsPerBlock, 1, 1); + return block; + } + + static dim3 get_grid_shape(Params const& params) { + auto [Q, K, D, HB] = params.problem_shape; + auto [H, B] = HB; + dim3 grid(ceil_div(K, TileShapeK{}), H, B); + return grid; + } +}; + +} // namespace cutlass::fmha::kernel diff --git a/examples/77_blackwell_fmha/reference/fmha_bwd_reference.hpp b/examples/77_blackwell_fmha/reference/fmha_bwd_reference.hpp new file mode 100644 index 00000000..bb8cfb34 --- /dev/null +++ b/examples/77_blackwell_fmha/reference/fmha_bwd_reference.hpp @@ -0,0 +1,311 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + + +#pragma once + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + class TensorDQ, /* class TensorDK, class TensorDV, */ + class Fusion +> +void __global__ fmha_bwd_reference_dQ_kernel( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + TensorDQ mDQ, /* TensorDK mDK, TensorDV mDV, */ + Fusion fusion) { + + using namespace cute; + + using Element = typename TensorO::value_type; + using ElementAccumulator = typename TensorLSE::value_type; + + extern __shared__ char mS_mem[]; + Element* mS = reinterpret_cast(mS_mem); + + Element softmax_scale = static_cast(1.0 / sqrt(1.0 * size<1>(mO))); + + for (int idx_L = blockIdx.y; idx_L < size<2>(mDQ); idx_L += gridDim.y) { + for (int idx_Q = blockIdx.x; idx_Q < size<0>(mDQ); idx_Q += gridDim.x) { + for (int idx_K = threadIdx.x; idx_K < size<0>(mK); idx_K += blockDim.x) { + ElementAccumulator acc_qk = 0; + ElementAccumulator acc_dov = 0; + ElementAccumulator acc_doo = 0; + for (int idx_D0 = 0; idx_D0 < size<1>(mK); idx_D0++) { + acc_qk += mQ(idx_Q, idx_D0, idx_L) * mK(idx_K, idx_D0, idx_L); + acc_dov += mDO(idx_Q, idx_D0, idx_L) * mV(idx_K, idx_D0, idx_L); + acc_doo += mDO(idx_Q, idx_D0, idx_L) * mO(idx_Q, idx_D0, idx_L); + } // for idx_D0 + + auto id = make_identity_tensor(make_shape(1, 1)); + auto frag = make_tensor(Shape<_1, _1>{}); + frag(0) = acc_qk; + fusion.apply_mask(frag, make_tensor(id.data() + make_arithmetic_tuple(idx_Q, idx_K), id.layout()), problem_shape); + acc_qk = frag(0); + + mS[idx_K] = static_cast(exp(softmax_scale * acc_qk - mLSE(idx_Q, idx_L)) * softmax_scale * (acc_dov - acc_doo)); + } // for idx_K + + __syncthreads(); + + for 
(int idx_D = threadIdx.x; idx_D < size<1>(mDQ); idx_D += blockDim.x) { + ElementAccumulator acc = 0; + for (int idx_K = 0; idx_K < size<0>(mK); idx_K++) { + acc += mS[idx_K] * mK(idx_K, idx_D, idx_L); + } + mDQ(idx_Q, idx_D, idx_L) = static_cast(acc); + } // for idx_D + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + /* class TensorDQ, */ class TensorDK, /* class TensorDV, */ + class Fusion +> +void __global__ fmha_bwd_reference_dK_kernel( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + /* TensorDQ mDQ, */ TensorDK mDK, /* TensorDV mDV, */ + Fusion fusion) { + + using namespace cute; + + using Element = typename TensorO::value_type; + using ElementAccumulator = typename TensorLSE::value_type; + + extern __shared__ char mS_mem[]; + Element* mS = reinterpret_cast(mS_mem); + + Element softmax_scale = static_cast(1.0 / sqrt(1.0 * size<1>(mO))); + + for (int idx_L = blockIdx.y; idx_L < size<2>(mDK); idx_L += gridDim.y) { + for (int idx_K = blockIdx.x; idx_K < size<0>(mDK); idx_K += gridDim.x) { + for (int idx_Q = threadIdx.x; idx_Q < size<0>(mDO); idx_Q += blockDim.x) { + ElementAccumulator acc_qk = 0; + ElementAccumulator acc_dov = 0; + ElementAccumulator acc_doo = 0; + for (int idx_D0 = 0; idx_D0 < size<1>(mK); idx_D0++) { + acc_qk += mQ(idx_Q, idx_D0, idx_L) * mK(idx_K, idx_D0, idx_L); + acc_dov += mDO(idx_Q, idx_D0, idx_L) * mV(idx_K, idx_D0, idx_L); + acc_doo += mDO(idx_Q, idx_D0, idx_L) * mO(idx_Q, idx_D0, idx_L); + } // for idx_D0 + + auto id = make_identity_tensor(make_shape(1, 1)); + auto frag = make_tensor(Shape<_1, _1>{}); + frag(0) = acc_qk; + fusion.apply_mask(frag, make_tensor(id.data() + make_arithmetic_tuple(idx_Q, idx_K), id.layout()), problem_shape); + acc_qk = frag(0); + + mS[idx_Q] = 
static_cast(exp(softmax_scale * acc_qk - mLSE(idx_Q, idx_L)) * softmax_scale * (acc_dov - acc_doo)); + } // for idx_Q + + __syncthreads(); + + for (int idx_D = threadIdx.x; idx_D < size<1>(mDK); idx_D += blockDim.x) { + ElementAccumulator acc = 0; + for (int idx_Q = 0; idx_Q < size<0>(mDO); idx_Q++) { + acc += mS[idx_Q] * mQ(idx_Q, idx_D, idx_L); + } + mDK(idx_K, idx_D, idx_L) = static_cast(acc); + } // for idx_D + } // for idx_K + } // for idx_L +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + /* class TensorDQ, class TensorDK, */ class TensorDV, + class Fusion +> +void __global__ fmha_bwd_reference_dV_kernel( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + /* TensorDQ mDQ, TensorDK mDK, */ TensorDV mDV, + Fusion fusion) { + + using namespace cute; + + using Element = typename TensorO::value_type; + using ElementAcc = typename TensorLSE::value_type; + + extern __shared__ char mS_mem[]; + Element* mS = reinterpret_cast(mS_mem); + + ElementAcc softmax_scale = static_cast(1.0 / sqrt(1.0 * size<1>(mO))); + + for (int idx_L = blockIdx.y; idx_L < size<2>(mDV); idx_L += gridDim.y) { + for (int idx_K = blockIdx.x; idx_K < size<0>(mDV); idx_K += gridDim.x) { + for (int idx_Q = threadIdx.x; idx_Q < size<0>(mDO); idx_Q += blockDim.x) { + ElementAcc acc_qk = 0; + + for (int idx_D0 = 0; idx_D0 < size<1>(mK); idx_D0++) { + ElementAcc rQ = mQ(idx_Q, idx_D0, idx_L); + ElementAcc rK = mK(idx_K, idx_D0, idx_L); + acc_qk += rQ * rK; + } // for idx_D0 + + auto id = make_identity_tensor(make_shape(1, 1)); + auto frag = make_tensor(Shape<_1, _1>{}); + frag(0) = acc_qk; + fusion.apply_mask(frag, make_tensor(id.data() + make_arithmetic_tuple(idx_Q, idx_K), id.layout()), problem_shape); + acc_qk = frag(0); + + mS[idx_Q] = 
static_cast(exp(softmax_scale * acc_qk - mLSE(idx_Q, idx_L))); + } // for idx_Q + + __syncthreads(); + + for (int idx_D = threadIdx.x; idx_D < size<1>(mDV); idx_D += blockDim.x) { + ElementAcc acc = 0; + for (int idx_Q = 0; idx_Q < size<0>(mDO); idx_Q++) { + ElementAcc rS = mS[idx_Q]; + ElementAcc rDO = mDO(idx_Q, idx_D, idx_L); + acc += rS * rDO; + } + mDV(idx_K, idx_D, idx_L) = static_cast(acc); + } // for idx_D + } // for idx_K + } // for idx_L +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + /**/ class TensorDQ, /** / class TensorDK, / ** / class TensorDV, / **/ + class Fusion +> +void fmha_bwd_reference_dQ( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + /**/ TensorDQ mDQ, /** / TensorDK mDK, / ** / TensorDV mDV, / **/ + Fusion fusion) { + + using namespace cute; + + dim3 grid(size<0>(mDQ), size<2>(mDQ), 1); + dim3 block(256); + int shared_mem = size<0>(mK) * sizeof(typename TensorO::value_type); + fmha_bwd_reference_dQ_kernel<<>>(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDQ, fusion); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + /** / class TensorDQ, / **/ class TensorDK, /** / class TensorDV, / **/ + class Fusion +> +void fmha_bwd_reference_dK( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + /** / TensorDQ mDQ, / **/ TensorDK mDK, /** / TensorDV mDV, / **/ + Fusion fusion) { + + using namespace cute; + + dim3 grid(size<0>(mDK), size<2>(mDK), 1); + dim3 block(256); + int shared_mem = size<0>(mDO) * sizeof(typename TensorO::value_type); + 
fmha_bwd_reference_dK_kernel<<>>(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDK, fusion); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + /** / class TensorDQ, / ** / class TensorDK, / **/ class TensorDV, /**/ + class Fusion +> +void fmha_bwd_reference_dV( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + /** / TensorDQ mDQ, / ** / TensorDK mDK, / **/ TensorDV mDV, /**/ + Fusion fusion) { + + using namespace cute; + + dim3 grid(size<0>(mDV), size<2>(mDV), 1); + dim3 block(256); + int shared_mem = size<0>(mDO) * sizeof(typename TensorO::value_type); + fmha_bwd_reference_dV_kernel<<>>(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDV, fusion); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShape, + class TensorQ, class TensorK, class TensorV, + class TensorO, class TensorLSE, class TensorDO, + class TensorDQ, class TensorDK, class TensorDV, + class Fusion +> +void fmha_bwd_reference( + ProblemShape problem_shape, + TensorQ mQ, TensorK mK, TensorV mV, + TensorO mO, TensorLSE mLSE, TensorDO mDO, + TensorDQ mDQ, TensorDK mDK, TensorDV mDV, + Fusion fusion) { + + fmha_bwd_reference_dQ(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDQ, fusion); + fmha_bwd_reference_dK(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDK, fusion); + fmha_bwd_reference_dV(problem_shape, mQ, mK, mV, mO, mLSE, mDO, mDV, fusion); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/77_blackwell_fmha/reference/fmha_fwd_reference.hpp b/examples/77_blackwell_fmha/reference/fmha_fwd_reference.hpp index 48d81101..b7c6b412 100644 --- a/examples/77_blackwell_fmha/reference/fmha_fwd_reference.hpp +++ 
b/examples/77_blackwell_fmha/reference/fmha_fwd_reference.hpp @@ -128,7 +128,7 @@ void __global__ fmha_reference_kernel( } if (threadIdx.x == 0) { - mLSE(idx_Q + offset_Q, idx_L) = log(sum) + maxS; + mLSE(idx_Q + offset_Q, idx_L) = log(sum) + softmax_scale * maxS; } } diff --git a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu new file mode 100644 index 00000000..d36bf4dd --- /dev/null +++ b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu @@ -0,0 +1,927 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +/*! \file + \brief Grouped GEMM example using CUTLASS 3x APIs for the NVIDIA Blackwell SM120 architecture. + + This example demonstrates an implementation of Grouped GEMM using a TMA + Blackwell SM120 TensorOp-based warp-specialized kernel + for narrow precisions (FP4) with input Scale Factors. + For this example all scheduling work is performed on the device, utilizing the device-side modification of TMA descriptors + to move between groups/problem_count (represented by groups). + https://docs.nvidia.com/cuda/cuda-c-programming-guide/#encoding-a-tensor-map-on-device + + To run this example: + + $ ./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm --m=2048 --n=2048 --k=2048 --groups=10 + + The above example command makes all 10 groups to be sized at the given m, n, k sizes. + Skipping any of the problem dimensions randomizes it across the different groups. + Same applies for alpha and beta values that are randomized across the different groups. 
+ + To run this example for a set of problems using the benchmark option: + + $ ./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm --benchmark=./test_benchmark.txt + + Where the test_benchmark.txt may look as such: + 0 256x512x128 + 1 256x512x512 + 2 512x256x128 + 3 256x256x128 + 4 256x512x1024 + 5 1024x512x128 and so on +*/ + +#include +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/gett.hpp" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "helper.h" +using namespace cute; + +using ProblemShape = cutlass::gemm::GroupProblemShape>; // per group +using ElementInput = cutlass::float_e2m1_t; // Element type for Input matrix operands + +#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// +// A matrix configuration 
+using ElementA = cutlass::nv_float4_t; // Element type for A matrix operand +using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 32; // Alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::nv_float4_t; // Element type for B matrix operand +using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 32; // Alignment of A matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using ElementD = float_e2m1_t; // Element type for D matrix operands +using ElementSFD = cutlass::float_ue4m3_t; // Element type for SF Output operands +using ElementC = cutlass::half_t; // Element type for C matrix operands +using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C and D matrix operands +using LayoutDTag = cutlass::layout::RowMajor; // Layout type for C and D matrix operands +using LayoutSFDTag = LayoutDTag; // Layout type for SFD should be same as D matrix operand + +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Alignment of C matrix in units of elements (up to 16 bytes) +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; // Alignment of D matrix in units of elements (up to 16 bytes) +// Kernel functional config +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for internal computation +using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Epilogue Operator class tag + +// Kernel Perf config +// Cluster Shape fixed to 1x1x1 +using ThreadBlockShape = Shape<_128,_128,_128>; +using ClusterShape = Shape<_1,_1,_1>; +constexpr int OutputSFVectorSize = 16; + +// D = alpha * acc + beta * C +// With BlockScaleFactor generation. 
+using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor< + OutputSFVectorSize, + ElementD, + ElementCompute, + ElementSFD, LayoutCTag, + ElementC>; + +// Cooperative kernel schedule +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ThreadBlockShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutCTag *, AlignmentC, + ElementD, LayoutCTag *, AlignmentD, + cutlass::epilogue::collective::EpilogueScheduleAuto, + FusionOperation +>::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutATag *, AlignmentA, + ElementB, LayoutBTag *, AlignmentB, + ElementAccumulator, + ThreadBlockShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto // Auto schedule defaults to cooperative schedule +>::CollectiveOp; +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloop, + CollectiveEpilogue +>; +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + +// Pingpong kernel schedule +using CollectiveMainloopPingpong = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutATag *, AlignmentA, + ElementB, LayoutBTag *, AlignmentB, + ElementAccumulator, + ThreadBlockShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong +>::CollectiveOp; + +using GemmKernelPingpong = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloopPingpong, + CollectiveEpilogue +>; + +using GemmPingpong = cutlass::gemm::device::GemmUniversalAdapter; + +using StrideA = typename 
Gemm::GemmKernel::InternalStrideA; +using StrideB = typename Gemm::GemmKernel::InternalStrideB; +using StrideC = typename Gemm::GemmKernel::InternalStrideC; +using StrideD = typename Gemm::GemmKernel::InternalStrideD; + +using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA; +using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB; +using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig; +using Sm1xxBlockScaledOutputConfig= cutlass::detail::Sm1xxBlockScaledOutputConfig< + OutputSFVectorSize, + cute::is_same_v ? cute::UMMA::Major::K : cute::UMMA::Major::MN + >; +using OutputSFAtom = typename Sm1xxBlockScaledOutputConfig::SfAtom; +using LayoutSFD = typename Sm1xxBlockScaledOutputConfig::LayoutSF; + +// Host-side allocations +std::vector stride_A_host; +std::vector stride_B_host; +std::vector layout_SFA_host; +std::vector layout_SFB_host; +std::vector stride_C_host; +std::vector stride_D_host; + +std::vector alpha_host; +std::vector beta_host; + +using HostTensorA = cutlass::HostTensor; +using HostTensorB = cutlass::HostTensor; +using HostTensorSF = cutlass::HostTensor; +using HostTensorC = cutlass::HostTensor; +using HostTensorD = cutlass::HostTensor; +std::vector block_A; +std::vector block_B; +std::vector block_SFA; +std::vector block_SFB; +std::vector block_C; +std::vector block_D; +std::vector block_SFD; +std::vector block_ref_D; +std::vector block_ref_SFD; + +// Device-side allocations +cutlass::DeviceAllocation problem_sizes; + +cutlass::DeviceAllocation ptr_A; +cutlass::DeviceAllocation ptr_B; +cutlass::DeviceAllocation ptr_SFA; +cutlass::DeviceAllocation ptr_SFB; +cutlass::DeviceAllocation ptr_C; +cutlass::DeviceAllocation ptr_D; +cutlass::DeviceAllocation ptr_SFD; +cutlass::DeviceAllocation ptr_ref_D; + +cutlass::DeviceAllocation stride_A; +cutlass::DeviceAllocation stride_B; +cutlass::DeviceAllocation layout_SFA; +cutlass::DeviceAllocation layout_SFB; 
+cutlass::DeviceAllocation stride_C; +cutlass::DeviceAllocation stride_D; + +// Note, this is an array of pointers to alpha and beta scaling values per group +cutlass::DeviceAllocation alpha_device; +cutlass::DeviceAllocation beta_device; +cutlass::DeviceAllocation block_alpha; +cutlass::DeviceAllocation block_beta; +// A matrix wide constant value to scale the output matrix +// Avoids generating small FP4 values. +// NormConst is a single device-side constant value, its not per-batch or per-group +cutlass::DeviceAllocation norm_constant_device; + +#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) + +template +auto make_iterator(T* ptr) { + using namespace cute; + if constexpr (cute::is_subbyte_v) { + return subbyte_iterator(ptr); + } + else { + return ptr; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm100GroupParams::RasterOrderOptions; +// Command line options parsing +struct Options { + + bool help = false; + bool verification = true; + bool use_pdl = false; + + float alpha = std::numeric_limits::max(); + float beta = std::numeric_limits::max(); + float norm_constant = 1.0; + int iterations = 10; + int m = 1024, n = 2048, k = 512, groups = 10; + RasterOrderOptions raster_order = RasterOrderOptions::AlongN; + int max_sm_count = INT_MAX; + std::string benchmark_path; + std::vector problem_sizes_host; + int const tma_alignment_bits = 128; + int const alignment = tma_alignment_bits / cutlass::sizeof_bits::value; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + if (cmd.check_cmd_line_flag("no_verif")) { + verification = false; + } + if 
(cmd.check_cmd_line_flag("use_pdl")) { + use_pdl = true; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("groups", groups); + cmd.get_cmd_line_argument("alpha", alpha, std::numeric_limits::max()); + cmd.get_cmd_line_argument("beta", beta, std::numeric_limits::max()); + cmd.get_cmd_line_argument("norm_constant", norm_constant, float(1.0)); + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("benchmark", benchmark_path); + cmd.get_cmd_line_argument("max_sm_count", max_sm_count, INT_MAX); + + // Decide how to initialize the problems + if (!benchmark_path.empty()) { + if (!benchmark_problems()) { + problem_sizes_host.clear(); + return; + } + } + else { + randomize_problems(cmd); + } + + char raster_char; + cmd.get_cmd_line_argument("raster", raster_char); + + if (raster_char == 'N' || raster_char == 'n') { + raster_order = RasterOrderOptions::AlongN; + } + else if (raster_char == 'M' || raster_char == 'm') { + raster_order = RasterOrderOptions::AlongM; + } + } + + void randomize_problems(cutlass::CommandLine &cmd) { + int cmd_line_m = -1, cmd_line_n = -1, cmd_line_k = -1; + cmd.get_cmd_line_argument("m", cmd_line_m); + cmd.get_cmd_line_argument("n", cmd_line_n); + cmd.get_cmd_line_argument("k", cmd_line_k); + + problem_sizes_host.reserve(groups); + + for (int i = groups; i > 0; i--) { + int m = cmd_line_m; + int n = cmd_line_n; + int k = cmd_line_k; + if (m < 1) { + m = alignment * ((rand() % 64) + 1); + } + if (n < 1) { + n = alignment * ((rand() % 64) + 1); + } + if (k < 1) { + k = alignment * ((rand() % 64) + 1); + } + problem_sizes_host.push_back({m, n, k}); + } + } + + /// Load a benchmark + bool benchmark_problems() { + std::ifstream file(benchmark_path); + if (!file.good()) { + return false; + } + + while (file.good()) { + + int idx = -1; + std::string extent_str; + + file >> idx >> extent_str; + + if (idx < 0 || extent_str.empty()) { + 
break; + } + + cutlass::gemm::GemmCoord extent; + std::vector tokens; + + cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); + + for (int i = 0; i < int(tokens.size()); ++i) { + int x = std::atoi(tokens.at(i).c_str()); + + // round up + if (x % alignment) { + x += (alignment - (x % alignment)); + } + + extent.at(i) = x; + } + + if (extent.product()) { + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + } + } + groups = static_cast(problem_sizes_host.size()); + + return true; + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "79d_blackwell_geforce_nvfp4_grouped_gemm\n\n" + << " Blackwell Block Scaled Narrow Precision Grouped GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM for all groups\n" + << " --n= Sets the N extent of the GEMM for all groups\n" + << " --k= Sets the K extent of the GEMM for all groups\n" + << " --groups= Sets the number of individual GEMM problems for Grouped GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n" + << " --norm_constant= Epilogue scalar normalization constant for the output matrix\n\n" + << " --raster= CTA Rasterization direction (N for along N, M for along M)\n\n" + << " --iterations= Number of profiling iterations to perform\n\n" + << " --benchmark= Executes a benchmark problem size\n" + << " --max_sm_count= Run kernels using only these number of SMs\n" + << " --no_verif Do not run (host-side) verification kernels\n" + << " --use_pdl Launch kernel with PDL (Programmatic Dependent Launch) enabled\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "79d_blackwell_geforce_nvfp4_grouped_gemm" << " --m=1024 --n=512 --k=1024 --groups=10 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s, std::vector problem_sizes_host) const + { + // 
Number of real-valued multiply-adds + uint64_t fmas = uint64_t(); + + for (auto const & problem : problem_sizes_host) { + fmas += static_cast(get<0>(problem)) * + static_cast(get<1>(problem)) * + static_cast(get<2>(problem)); + } + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * uint64_t(fmas); + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/// Result structure +struct Result +{ + double avg_runtime_ms = 0.0; + double gflops = 0.0; + cutlass::Status status = cutlass::Status::kSuccess; + cudaError_t error = cudaSuccess; + bool passed = false; +}; + +#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::TensorView view, + uint64_t seed) { + + double scope_max, scope_min; + constexpr int bits_input = cutlass::sizeof_bits::value; + + if constexpr (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } + else if constexpr (bits_input <= 6) { + scope_max = 2; + scope_min = -2; + } + else if constexpr (bits_input <= 8) { + if constexpr (cute::is_same_v) { + scope_max = 4; + scope_min = 1; + } + else { + scope_max = 1; + scope_min = -1; + } + } + else{ + scope_max = 4; + scope_min = -4; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + + return true; +} + +/// Allocates device-side data +void allocate(const Options &options) { + for (int32_t i = 0; i < options.groups; ++i) { + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + + auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1}); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1}); + 
auto stride_C = cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1}); + auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1}); + + auto layout_A = make_layout(make_shape(M, K, 1), stride_A); + auto layout_B = make_layout(make_shape(N, K, 1), stride_B); + auto layout_C = make_layout(make_shape(M, N, 1), stride_C); + auto layout_D = make_layout(make_shape(M, N, 1), stride_D); + auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1)); + auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1)); + auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1)); + + stride_A_host.push_back(stride_A); + stride_B_host.push_back(stride_B); + layout_SFA_host.push_back(layout_SFA); + layout_SFB_host.push_back(layout_SFB); + stride_C_host.push_back(stride_C); + stride_D_host.push_back(stride_D); + + block_A.push_back(HostTensorA(cutlass::make_Coord(size(layout_A)))); + block_B.push_back(HostTensorB(cutlass::make_Coord(size(layout_B)))); + block_SFA.push_back(HostTensorSF(cutlass::make_Coord(size(filter_zeros(layout_SFA))))); + block_SFB.push_back(HostTensorSF(cutlass::make_Coord(size(filter_zeros(layout_SFB))))); + block_C.push_back(HostTensorC(cutlass::make_Coord(size(layout_C)))); + block_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D)))); + block_SFD.push_back(HostTensorSF(cutlass::make_Coord(size(filter_zeros(layout_SFD))))); + block_ref_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D)))); + block_ref_SFD.push_back(HostTensorSF(cutlass::make_Coord(size(filter_zeros(layout_SFD))))); + } + block_alpha.reset(options.groups); + block_beta.reset(options.groups); +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + uint64_t seed = 2020; + problem_sizes.reset(options.groups); + problem_sizes.copy_from_host(options.problem_sizes_host.data()); + + // + // Assign pointers 
+ // + + std::vector ptr_A_host(options.groups); + std::vector ptr_B_host(options.groups); + std::vector ptr_SFA_host(options.groups); + std::vector ptr_SFB_host(options.groups); + std::vector ptr_C_host(options.groups); + std::vector ptr_D_host(options.groups); + std::vector ptr_SFD_host(options.groups); + std::vector ptr_alpha_host(options.groups); + std::vector ptr_beta_host(options.groups); + + for (int32_t i = 0; i < options.groups; ++i) { + + initialize_block(block_A.at(i).host_view(), seed + 2021); + initialize_block(block_B.at(i).host_view(), seed + 2022); + initialize_block(block_C.at(i).host_view(), seed + 2023); + initialize_block(block_SFA.at(i).host_view(), seed + 2024); + initialize_block(block_SFB.at(i).host_view(), seed + 2025); + + block_A.at(i).sync_device(); + block_B.at(i).sync_device(); + block_C.at(i).sync_device(); + block_SFA.at(i).sync_device(); + block_SFB.at(i).sync_device(); + + ptr_A_host.at(i) = block_A.at(i).device_data(); + ptr_B_host.at(i) = block_B.at(i).device_data(); + ptr_SFA_host.at(i) = block_SFA.at(i).device_data(); + ptr_SFB_host.at(i) = block_SFB.at(i).device_data(); + ptr_C_host.at(i) = block_C.at(i).device_data(); + ptr_D_host.at(i) = block_D.at(i).device_data(); + ptr_SFD_host.at(i) = block_SFD.at(i).device_data(); + + alpha_host.push_back((options.alpha == std::numeric_limits::max()) ? static_cast((rand() % 5) + 1) : options.alpha); + beta_host.push_back((options.beta == std::numeric_limits::max()) ? 
static_cast(rand() % 5) : options.beta); + ptr_alpha_host.at(i) = block_alpha.get() + i; + ptr_beta_host.at(i) = block_beta.get() + i; + } + + ptr_A.reset(options.groups); + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(options.groups); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_SFA.reset(options.groups); + ptr_SFA.copy_from_host(ptr_SFA_host.data()); + + ptr_SFB.reset(options.groups); + ptr_SFB.copy_from_host(ptr_SFB_host.data()); + + ptr_C.reset(options.groups); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(options.groups); + ptr_D.copy_from_host(ptr_D_host.data()); + + ptr_SFD.reset(options.groups); + ptr_SFD.copy_from_host(ptr_SFD_host.data()); + + stride_A.reset(options.groups); + stride_A.copy_from_host(stride_A_host.data()); + + stride_B.reset(options.groups); + stride_B.copy_from_host(stride_B_host.data()); + + layout_SFA.reset(options.groups); + layout_SFA.copy_from_host(layout_SFA_host.data()); + + layout_SFB.reset(options.groups); + layout_SFB.copy_from_host(layout_SFB_host.data()); + + stride_C.reset(options.groups); + stride_C.copy_from_host(stride_C_host.data()); + + stride_D.reset(options.groups); + stride_D.copy_from_host(stride_D_host.data()); + + alpha_device.reset(options.groups); + alpha_device.copy_from_host(ptr_alpha_host.data()); + beta_device.reset(options.groups); + beta_device.copy_from_host(ptr_beta_host.data()); + + block_alpha.copy_from_host(alpha_host.data()); + block_beta.copy_from_host(beta_host.data()); + + norm_constant_device.reset(1); + norm_constant_device.copy_from_host(&options.norm_constant); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +template +typename Gemm::Arguments args_from_options(Options &options, bool host_problem_shapes_available = true) +{ + cutlass::KernelHardwareInfo hw_info; + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. 
+ hw_info.device_id = 0; + hw_info.sm_count = min(cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id), options.max_sm_count); + + typename Gemm::Arguments arguments; + decltype(arguments.epilogue.thread) fusion_args; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + + // If alpha/beta are provided (via cmd line args) and are scalar, i.e., same alpha/beta applies to all batches. + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups. + if (options.alpha != std::numeric_limits::max()){ + // Single alpha for all groups + fusion_args.alpha = options.alpha; + fusion_args.alpha_ptr_array = nullptr; + fusion_args.dAlpha = {_0{}, _0{}, 0}; + } + else { + fusion_args.alpha = 0; + fusion_args.alpha_ptr_array = alpha_device.get(); + // Only one alpha per each group + fusion_args.dAlpha = {_0{}, _0{}, 1}; + } + if (options.beta != std::numeric_limits::max()) { + // Single beta for all groups + fusion_args.beta = options.beta; + fusion_args.beta_ptr_array = nullptr; + fusion_args.dBeta = {_0{}, _0{}, 0}; + } + else { + fusion_args.beta = 0; + fusion_args.beta_ptr_array = beta_device.get(); + // Only one beta per each group + fusion_args.dBeta = {_0{}, _0{}, 1}; + } + + // Output Block SF + fusion_args.block_scale_factor_ptr = ptr_SFD.get(); // Enable for SF Output + fusion_args.norm_constant_ptr = norm_constant_device.get(); // Enable for SF Output + + typename Gemm::GemmKernel::TileSchedulerArguments scheduler; + scheduler.raster_order = options.raster_order; + + if (host_problem_shapes_available) { + arguments = typename Gemm::Arguments { + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), options.problem_sizes_host.data()}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get(), + ptr_SFA.get(), layout_SFA.get(), ptr_SFB.get(), layout_SFB.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()}, + hw_info, scheduler + }; 
+ } + else { + arguments = typename Gemm::Arguments { + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), nullptr}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get(), + ptr_SFA.get(), layout_SFA.get(), ptr_SFB.get(), layout_SFB.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()}, + hw_info, scheduler + }; + } + + return arguments; +} + +bool verify(const Options &options) { + using namespace cute; + bool passed = true; + for (int32_t i = 0; i < options.groups; ++i) { + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + + auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1}); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1}); + auto stride_C = cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1}); + auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1}); + auto layout_A = make_layout(make_shape(M, K, 1), stride_A); + auto layout_B = make_layout(make_shape(N, K, 1), stride_B); + auto layout_C = make_layout(make_shape(M, N, 1), stride_C); + auto layout_D = make_layout(make_shape(M, N, 1), stride_D); + auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1)); + auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1)); + auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1)); + + // Create the arguments for host reference implementation + Tensor tensor_A = make_tensor(make_iterator(block_A.at(i).host_data()), layout_A); + Tensor tensor_SFA = make_tensor(block_SFA.at(i).host_data(), layout_SFA); + Tensor tensor_B = make_tensor(make_iterator(block_B.at(i).host_data()), layout_B); + Tensor tensor_SFB = make_tensor(block_SFB.at(i).host_data(), layout_SFB); + cutlass::reference::host::GettBlockScalingMainloopParams + mainloop_params{tensor_A, 
tensor_SFA, tensor_B, tensor_SFB}; + + auto tensor_C = cute::make_tensor(make_iterator(block_C.at(i).host_data()), layout_C); + auto tensor_ref_D = cute::make_tensor(make_iterator(block_ref_D.at(i).host_data()), layout_D); + auto tensor_ref_SFD = cute::make_tensor(make_iterator(block_ref_SFD.at(i).host_data()), layout_SFD); + + cutlass::reference::host::GettBlockScalingEpilogueParams< + ElementCompute, // ElementScalar + ElementAccumulator, // ElementAccumulator + ElementCompute, // ElementCompute + decltype(tensor_C), // TensorC + decltype(tensor_ref_D), // TensorD + decltype(tensor_ref_SFD), // TensorSfD + cute::Int, + cutlass::reference::host::SfStrategy::SfDGen + > epilogue_params {alpha_host.at(i), beta_host.at(i), tensor_C, tensor_ref_D, tensor_ref_SFD, options.norm_constant}; + + cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + // Comparison + block_D.at(i).sync_host(); + block_SFD.at(i).sync_host(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + passed &= cutlass::reference::host::TensorEquals(block_ref_D.at(i).host_view(), block_D.at(i).host_view()); + passed &= cutlass::reference::host::TensorEquals(block_ref_SFD.at(i).host_view(), block_SFD.at(i).host_view()); + // Check that the tensors have non-zero norms + passed &= (cutlass::reference::host::TensorNorm(block_ref_D.at(i).host_view()) > 0); + passed &= (cutlass::reference::host::TensorNorm(block_D.at(i).host_view()) > 0); + passed &= (cutlass::reference::host::TensorNorm(block_ref_SFD.at(i).host_view()) > 0); + passed &= (cutlass::reference::host::TensorNorm(block_SFD.at(i).host_view()) > 0); + } + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options, bool host_problem_shapes_available = true) +{ + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << 
alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options, host_problem_shapes_available); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl)); + + cudaDeviceSynchronize(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + if (options.verification) { + std::cout << " Host-side verification is now running - may be very slow for large cases." << std::endl; + result.passed = verify(options); + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + if (!result.passed) { + exit(-1); + } + } + else { + std::cout << " Verfication is turned off for this run." << std::endl; + } + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl)); + } + timer.stop(); + + // Compute average setup and runtime and GFLOPs. 
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); + + std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS : " << result.gflops << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.8 Toolkit to run this example + if (__CUDACC_VER_MAJOR__ < 12 || + ((__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8) + ) + ) { + std::cerr << "This example requires CUDA 12.8 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(&current_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (!(props.major == 12 && props.minor == 0)) { + std::cerr + << "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120a).\n"; + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + +#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) + allocate(options); + initialize(options); + + // + // Evaluate CUTLASS kernels + // + + std::cout << "Running kernel with Cooperative kernel schedule:" << std::endl; + run(options, false /*host_problem_shapes_available*/); + std::cout << "Running kernel with Pingpong kernel schedule:" << std::endl; + run(options, false /*host_problem_shapes_available*/); +#endif + + return 0; +} + 
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/79_blackwell_geforce_gemm/CMakeLists.txt b/examples/79_blackwell_geforce_gemm/CMakeLists.txt index cb7e3e97..b689c85e 100644 --- a/examples/79_blackwell_geforce_gemm/CMakeLists.txt +++ b/examples/79_blackwell_geforce_gemm/CMakeLists.txt @@ -28,6 +28,24 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +set(TEST_RANDOM --iterations=0) # Random problem sizes +set(TEST_RANDOM_LARGE_GROUP --groups=50 --iterations=0) # Random problem sizes + +set(TEST_EPILOGUE --alpha=0.5 --beta=0.5 --iterations=0) # Random problem sizes +set(TEST_EPILOGUE_LARGE_GROUP --alpha=1.5 --beta=2.0 --groups=50 --iterations=0) # Random problem sizes + +set(TEST_EPILOGUE_OP --beta=0.5 --iterations=1) # Random problem sizes +set(TEST_EPILOGUE_OP_LARGE_GROUP --alpha=1.5 --iterations=1) # Random problem sizes + +set(TEST_FIXED --m=2048 --n=5120 --k=8192 --iterations=0) # Fixed problem sizes +set(TEST_FIXED_LARGE_GROUP --m=2048 --n=512 --k=512 --groups=51 --iterations=0) # Fixed problem sizes + +set(TEST_SMALL --m=256 --n=128 --iterations=0) # Small problem sizes +set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0) # Small problem sizes + +set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes +set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes + if (CUTLASS_NVCC_ARCHS MATCHES 120a) cutlass_example_add_executable( 79a_blackwell_geforce_nvfp4_bf16_gemm @@ -44,4 +62,22 @@ cutlass_example_add_executable( 79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu ) +cutlass_example_add_executable( + 79d_blackwell_geforce_nvfp4_grouped_gemm + 79d_blackwell_geforce_nvfp4_grouped_gemm.cu + TEST_COMMAND_OPTIONS + TEST_RANDOM + TEST_RANDOM_LARGE_GROUP + TEST_EPILOGUE + TEST_EPILOGUE_LARGE_GROUP + TEST_EPILOGUE_OP + TEST_EPILOGUE_OP_LARGE_GROUP + TEST_FIXED + TEST_FIXED_LARGE_GROUP + TEST_SMALL + 
TEST_SMALL_LARGE_GROUP + TEST_RANDOM_PERF + TEST_RANDOM_PERF_LARGE_GROUP +) + endif() diff --git a/examples/cute/tutorial/blackwell/01_mma_sm100.cu b/examples/cute/tutorial/blackwell/01_mma_sm100.cu index 3f73140a..a11fb17c 100644 --- a/examples/cute/tutorial/blackwell/01_mma_sm100.cu +++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu @@ -61,7 +61,8 @@ #include // CuTe tensor implementation #include // CuTe functions for querying the details of cluster launched #include // Compile time in constants such as _1, _256 etc. -#include +#include // Auto vectorized copy operation +#include // TMEM allocator for SM100 // Tutorial helpers #include "example_utils.hpp" @@ -122,7 +123,9 @@ struct SharedStorage alignas(128) cute::ArrayEngine> A; alignas(128) cute::ArrayEngine> B; - alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM + alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM + + alignas(16) cute::uint32_t tmem_base_ptr; // Base pointer for TMEM allocation CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(A.begin()), ASmemLayout{}); } CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(B.begin()), BSmemLayout{}); } @@ -225,6 +228,18 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator. 
Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N) + uint32_t elect_one_thr = cute::elect_one_sync(); + uint32_t elect_one_warp = (threadIdx.x / 32 == 0); + + using TmemAllocator = cute::TMEM::Allocator1Sm; + TmemAllocator tmem_allocator{}; + + if (elect_one_warp) { + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + } + __syncthreads(); // Wait for all threads until warp0 allocates TMEM + tCtAcc.data() = shared_storage.tmem_base_ptr; + if (thread0()) { print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16) print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16) @@ -233,10 +248,8 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) print("tCtAcc:\t"); print(tCtAcc); print("\n"); // tCtAcc: tmem_[32b](TMEM_ADDR) o ((_128,_256),_1,_1):((_65536,_1),_0,_0) } __syncthreads(); - // Barrier Initialization - uint32_t elect_one_thr = cute::elect_one_sync(); - uint32_t elect_one_warp = (threadIdx.x / 32 == 0); + // Barrier Initialization // Barriers in SMEM initialized by a single thread. if (elect_one_warp && elect_one_thr) { cute::initialize_barrier(shared_storage.mma_barrier, /* num_ctas */ 1); @@ -306,6 +319,15 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) axpby(alpha, tDrAcc, beta, tDrC); // Store RMEM -> GMEM copy(tDrC, tDgD); + + __syncthreads(); + + // Release the right to allocate before deallocations so that the next CTA can rasterize + // Then deallocate TMEM + if (elect_one_warp) { + tmem_allocator.release_allocation_lock(); + tmem_allocator.free(shared_storage.tmem_base_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } } template // CuTe tensor implementation #include // CuTe functions for querying the details of cluster launched #include // Compile time in constants such as _1, _256 etc. 
-#include +#include // Auto vectorized copy operation +#include // TMEM allocator for SM100 // Tutorial helpers #include "example_utils.hpp" @@ -124,6 +125,8 @@ struct SharedStorage alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM alignas(16) cute::uint64_t tma_barrier; // Barrier to track TMA data transfers to SMEM + alignas(16) cute::uint32_t tmem_base_ptr; // Base pointer for TMEM allocation + CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(A.begin()), ASmemLayout{}); } CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(B.begin()), BSmemLayout{}); } }; @@ -228,6 +231,18 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator. Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N) + uint32_t elect_one_thr = cute::elect_one_sync(); + uint32_t elect_one_warp = (threadIdx.x / 32 == 0); + + using TmemAllocator = cute::TMEM::Allocator1Sm; + TmemAllocator tmem_allocator{}; + + if (elect_one_warp) { + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + } + __syncthreads(); // Wait for all threads until warp0 allocates TMEM + tCtAcc.data() = shared_storage.tmem_base_ptr; + if (thread0()) { print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16) print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16) @@ -269,9 +284,6 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) } __syncthreads(); // Barrier Initialization - uint32_t elect_one_thr = cute::elect_one_sync(); - uint32_t elect_one_warp = (threadIdx.x / 32 == 0); - // Barriers in SMEM initialized by a single thread. 
if (elect_one_warp && elect_one_thr) { cute::initialize_barrier(shared_storage.mma_barrier, /* num_ctas */ 1); @@ -346,6 +358,15 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) axpby(alpha, tDrAcc, beta, tDrC); // Store RMEM -> GMEM copy(tDrC, tDgD); + + __syncthreads(); + + // Release the right to allocate before deallocations so that the next CTA can rasterize + // Then deallocate TMEM + if (elect_one_warp) { + tmem_allocator.release_allocation_lock(); + tmem_allocator.free(shared_storage.tmem_base_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } } template // CuTe tensor implementation #include // CuTe functions for querying the details of cluster launched #include // Compile time in constants such as _1, _256 etc. -#include +#include // Auto vectorized copy operation +#include // TMEM allocator for SM100 // Tutorial helpers #include "example_utils.hpp" @@ -129,6 +130,8 @@ struct SharedStorage alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM alignas(16) cute::uint64_t tma_barrier; // Barrier to track TMA data transfers to SMEM + alignas(16) cute::uint32_t tmem_base_ptr; // Base pointer for TMEM allocation + CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(A.begin()), ASmemLayout{}); } CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(B.begin()), BSmemLayout{}); } }; @@ -231,6 +234,18 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator. 
Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N) + uint32_t elect_one_thr = cute::elect_one_sync(); + uint32_t elect_one_warp = (threadIdx.x / 32 == 0); + + using TmemAllocator = cute::TMEM::Allocator1Sm; + TmemAllocator tmem_allocator{}; + + if (elect_one_warp) { + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + } + __syncthreads(); // Wait for all threads until warp0 allocates TMEM + tCtAcc.data() = shared_storage.tmem_base_ptr; + if (thread0()) { print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16) print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16) @@ -305,10 +320,6 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) } __syncthreads(); // Barrier Initialization - - uint32_t elect_one_thr = cute::elect_one_sync(); - uint32_t elect_one_warp = (threadIdx.x / 32 == 0); - // Barriers in SMEM initialized by a single thread. if (elect_one_warp && elect_one_thr) { // The number of CTAs that participates in multicast operation with this CTA (for both A and B matrices) @@ -385,6 +396,15 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) axpby(alpha, tDrAcc, beta, tDrC); // Store RMEM -> GMEM copy(tDrC, tDgD); + + __syncthreads(); + + // Release the right to allocate before deallocations so that the next CTA can rasterize + // Then deallocate TMEM + if (elect_one_warp) { + tmem_allocator.release_allocation_lock(); + tmem_allocator.free(shared_storage.tmem_base_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } } template // CuTe tensor implementation #include // CuTe functions for querying the details of cluster launched #include // Compile time in constants such as _1, _256 etc. 
-#include +#include // Auto vectorized copy operation +#include // TMEM allocator for SM100 // Tutorial helpers #include "example_utils.hpp" @@ -132,6 +133,8 @@ struct SharedStorage alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM alignas(16) cute::uint64_t tma_barrier; // Barrier to track TMA data transfers to SMEM + alignas(16) cute::uint32_t tmem_base_ptr; // Base pointer for TMEM allocation + CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(A.begin()), ASmemLayout{}); } CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(B.begin()), BSmemLayout{}); } }; @@ -234,6 +237,18 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator. Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N) + uint32_t elect_one_thr = cute::elect_one_sync(); + uint32_t elect_one_warp = (threadIdx.x / 32 == 0); + + using TmemAllocator = cute::TMEM::Allocator2Sm; + TmemAllocator tmem_allocator{}; + + if (elect_one_warp) { + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + } + __syncthreads(); // Wait for all threads until warp0 allocates TMEM + tCtAcc.data() = shared_storage.tmem_base_ptr; + if (thread0()) { print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16) print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16) @@ -262,6 +277,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Construct the CTA-in-Cluster coordinate for multicasting auto cta_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(int(cute::block_rank_in_cluster())); + auto elect_one_cta = get<0>(cta_in_cluster_coord_vmnk) == Int<0>{}; // Project the cluster_layout for tma_A along the N-modes auto [tAgA, tAsA] = 
tma_partition(tma_atom_A, @@ -299,10 +315,6 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) } __syncthreads(); // Barrier Initialization - auto elect_one_thr = cute::elect_one_sync(); - auto elect_one_warp = (threadIdx.x / 32 == 0); - auto elect_one_cta = get<0>(cta_in_cluster_coord_vmnk) == Int<0>{}; - // Barriers in SMEM should be initialized by a single thread. if (elect_one_warp && elect_one_thr) { // The number of CTAs that participates in multicast operation with this CTA (for both A and B matrices) @@ -386,6 +398,15 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) axpby(alpha, tDrAcc, beta, tDrC); // Store RMEM -> GMEM copy(tDrC, tDgD); + + __syncthreads(); + + // Release the right to allocate before deallocations so that the next CTA can rasterize + // Then deallocate TMEM + if (elect_one_warp) { + tmem_allocator.release_allocation_lock(); + tmem_allocator.free(shared_storage.tmem_base_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } } template // CuTe tensor implementation #include // CuTe functions for querying the details of cluster launched #include // Compile time in constants such as _1, _256 etc. 
-#include +#include // Auto vectorized copy operation +#include // TMEM allocator for SM100 // Tutorial helpers #include "example_utils.hpp" @@ -140,6 +141,8 @@ struct SharedStorage alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM alignas(16) cute::uint64_t tma_barrier; // Barrier to track TMA data transfers to SMEM + alignas(16) cute::uint32_t tmem_base_ptr; // Base pointer for TMEM allocation + CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(tensors.mainloop.A.begin()), ASmemLayout{}); } CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(tensors.mainloop.B.begin()), BSmemLayout{}); } CUTE_DEVICE constexpr auto tensor_sC() { return make_tensor(make_smem_ptr(tensors.C.begin()), CSmemLayout{}); } @@ -247,6 +250,18 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator. Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N) + uint32_t elect_one_thr = cute::elect_one_sync(); + uint32_t elect_one_warp = (threadIdx.x / 32 == 0); + + using TmemAllocator = cute::TMEM::Allocator2Sm; + TmemAllocator tmem_allocator{}; + + if (elect_one_warp) { + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + } + __syncthreads(); // Wait for all threads until warp0 allocates TMEM + tCtAcc.data() = shared_storage.tmem_base_ptr; + if (thread0()) { print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16) print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16) @@ -275,6 +290,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Construct the CTA-in-Cluster coordinate for multicasting auto cta_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(int(cute::block_rank_in_cluster())); + auto elect_one_cta = 
get<0>(cta_in_cluster_coord_vmnk) == Int<0>{}; // Project the cluster_layout for tma_A along the N-modes auto [tAgA, tAsA] = tma_partition(tma_atom_A, @@ -312,10 +328,6 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) } __syncthreads(); // Barrier Initialization - auto elect_one_thr = cute::elect_one_sync(); - auto elect_one_warp = (threadIdx.x / 32 == 0); - auto elect_one_cta = get<0>(cta_in_cluster_coord_vmnk) == Int<0>{}; - // Barriers in SMEM should be initialized by a single thread. if (elect_one_warp && elect_one_thr) { // The number of CTAs that participates in multicast operation with this CTA (for both A and B matrices) @@ -441,6 +453,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) } __syncthreads(); // All threads sync with issuing thread } + __syncthreads(); + + // Release the right to allocate before deallocations so that the next CTA can rasterize + // Then deallocate TMEM + if (elect_one_warp) { + tmem_allocator.release_allocation_lock(); + tmem_allocator.free(shared_storage.tmem_base_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } } template #include +#include #include #include #include @@ -277,34 +278,13 @@ transform_leaf(T0 const& t0, T1 const& t1, F&& f) // find and find_if // -namespace detail { - -template -CUTE_HOST_DEVICE constexpr -auto -find_if(T const& t, F&& f, seq) -{ - if constexpr (decltype(f(get(t)))::value) { - return cute::C{}; - } else - if constexpr (sizeof...(Is) == 0) { - return cute::C{}; - } else { - return find_if(t, f, seq{}); - } - - CUTE_GCC_UNREACHABLE; -} - -} // end namespace detail - template CUTE_HOST_DEVICE constexpr auto find_if(T const& t, F&& f) { if constexpr (is_tuple::value) { - return detail::find_if(t, f, tuple_seq{}); + return detail::tapply(t, f, [] (auto... a) { return cute::C>{}; }, tuple_seq{}); } else { return cute::C{}; } @@ -326,7 +306,7 @@ auto any_of(T const& t, F&& f) { if constexpr (is_tuple::value) { - return detail::apply(cute::transform(t, f), [&] (auto const&... 
a) { return (false_type{} || ... || a); }, tuple_seq{}); + return detail::tapply(t, f, [] (auto... a) { return (false_type{} || ... || a); }, tuple_seq{}); } else { return f(t); } @@ -340,7 +320,7 @@ auto all_of(T const& t, F&& f) { if constexpr (is_tuple::value) { - return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (true_type{} && ... && a); }, tuple_seq{}); + return detail::tapply(t, f, [] (auto... a) { return (true_type{} && ... && a); }, tuple_seq{}); } else { return f(t); } diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp index ba22ef1c..524a47ef 100644 --- a/include/cute/arch/cluster_sm90.hpp +++ b/include/cute/arch/cluster_sm90.hpp @@ -31,6 +31,7 @@ #pragma once #include +#include // Config #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \ diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp index 91589538..2383b4e6 100644 --- a/include/cute/arch/config.hpp +++ b/include/cute/arch/config.hpp @@ -72,6 +72,27 @@ # define CUTE_ARCH_TCGEN05_F16BF16_MMA_SCALED_ENABLED #endif +#if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED)) +# define CUTE_ARCH_TMA_SM90_ENABLED +# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED +# define CUTE_ARCH_STSM_SM90_ENABLED +# define CUTE_ARCH_TCGEN05_TF32_MMA_ENABLED +# define CUTE_ARCH_TCGEN05_F16F32_MMA_ENABLED +# define CUTE_ARCH_TCGEN05_MXF8F6F4_MMA_ENABLED +# define CUTE_ARCH_TCGEN05_MXF4_MMA_ENABLED +# define CUTE_ARCH_TCGEN05_MXF4NVF4_MMA_ENABLED +#endif + +#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) +# define CUTE_ARCH_TCGEN05_F16BF16_MMA_SCALED_ENABLED +#endif + +#if (defined(CUTLASS_ARCH_MMA_SM120F_ENABLED)) +# define CUTE_ARCH_TMA_SM90_ENABLED +# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED +# define CUTE_ARCH_STSM_SM90_ENABLED +#endif + #if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED)) # define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED #endif @@ -91,8 
+112,11 @@ #endif // {add, mul, fma}.f32x2 PTX -#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED)) - #define CUTE_ARCH_FLOAT2_MATH_ENABLED +#if defined(CUTLASS_ARCH_MMA_SM100_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) + // Enable CuTe MMA Atoms +# define CUTE_ARCH_FFMA2_SM100_ENABLED + // Enable f32x2 PTX generation +# define CUTE_ARCH_FLOAT2_MATH_ENABLED #endif #if defined(CUTLASS_ARCH_MMA_SM120_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) @@ -109,3 +133,37 @@ # endif #endif +#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) +# define CUTE_ARCH_LDSM_SM100A_ENABLED +# define CUTE_ARCH_STSM_SM100A_ENABLED +# define CUTE_ARCH_TCGEN05_TMEM_ENABLED +# define CUTE_ARCH_TMA_SM100_ENABLED +# define CUTE_ARCH_FLOAT2_MATH_ENABLED +#endif + +#if defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) +# define CUTE_ARCH_LDSM_SM100A_ENABLED +# define CUTE_ARCH_STSM_SM100A_ENABLED +# define CUTE_ARCH_TCGEN05_TMEM_ENABLED +# define CUTE_ARCH_TMA_SM100_ENABLED +#endif + +#if defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) +# define CUTE_ARCH_LDSM_SM100A_ENABLED +# define CUTE_ARCH_STSM_SM100A_ENABLED +#endif + +#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) ||\ + defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\ + defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120F_ENABLED)) +# if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)) +# define CUTE_ARCH_LOAD256_SM100A_ENABLED +# define CUTE_ARCH_STORE256_SM100A_ENABLED +# endif +#endif + +// {add, mul, fma}.f32x2 PTX +#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) + #define CUTE_ARCH_FLOAT2_MATH_ENABLED +#endif + diff --git a/include/cute/arch/copy_sm100.hpp b/include/cute/arch/copy_sm100.hpp index 19b13841..aa969afe 100644 --- a/include/cute/arch/copy_sm100.hpp +++ b/include/cute/arch/copy_sm100.hpp @@ -28,10 +28,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ - -// - -// #pragma once #include @@ -316,17 +312,14 @@ struct SM100_U8x16_STSM_T } }; -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cute - //////////////////////////////////////////////////////////////////////////////////////////////////// // // UTCCP PTX definitions // //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace cute { +namespace SM100::TMEM::UTCCP { + // 128 data path lanes, 256-bit pattern, 1cta mode struct SM100_UTCCP_128dp256bit_1cta { @@ -558,21 +551,19 @@ struct SM100_UTCCP_2x64dp128bitlw0123_2cta } }; -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cute +} // end namespace SM100::TMEM::UTCCP //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace cute { +namespace SM100::TMEM::LOAD { //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// // -// TMEM_LOAD PTX definitions +// TMEM LOAD PTX definitions // //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -3945,7 +3936,6 @@ struct SM100_TMEM_LOAD_32dp32b128x } }; - //////////////////////////////////////////////////////////////////////////////////////////////////// // 32 data path lanes, 32-bit pattern, repeated 128 times, packed 16b read @@ -4065,9 +4055,21 @@ struct SM100_TMEM_LOAD_32dp32b128x_16b //////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace SM100::TMEM::LOAD + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace SM100::TMEM::STORE { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////////// // -// TMEM_STORE PTX definitions +// TMEM STORE PTX definitions // //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -4086,8 +4088,8 @@ struct SM100_TMEM_STORE_16dp256b1x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x256b.x1.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -4110,8 +4112,8 @@ struct SM100_TMEM_STORE_16dp256b1x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -4136,8 +4138,8 @@ struct SM100_TMEM_STORE_16dp256b2x asm volatile ("tcgen05.st.sync.aligned.16x256b.x2.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -4163,8 +4165,8 @@ struct SM100_TMEM_STORE_16dp256b2x_16b asm volatile ("tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32" "[%0]," "{%1, %2, %3, 
%4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -4194,8 +4196,8 @@ struct SM100_TMEM_STORE_16dp256b4x "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4227,8 +4229,8 @@ struct SM100_TMEM_STORE_16dp256b4x_16b "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4268,8 +4270,8 @@ struct SM100_TMEM_STORE_16dp256b8x "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4313,8 +4315,8 @@ struct SM100_TMEM_STORE_16dp256b8x_16b "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4374,8 +4376,8 @@ struct SM100_TMEM_STORE_16dp256b16x "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4443,8 +4445,8 @@ struct SM100_TMEM_STORE_16dp256b16x_16b "%49, %50, %51, %52," "%53, %54, %55, 
%56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4544,8 +4546,8 @@ struct SM100_TMEM_STORE_16dp256b32x "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -4661,8 +4663,8 @@ struct SM100_TMEM_STORE_16dp256b32x_16b "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -4716,8 +4718,8 @@ struct SM100_TMEM_STORE_16dp128b1x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x128b.x1.b32" "[%0]," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -4740,8 +4742,8 @@ struct SM100_TMEM_STORE_16dp128b1x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32" "[%0]," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -4764,8 +4766,8 @@ struct SM100_TMEM_STORE_16dp128b2x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x128b.x2.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), 
"r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -4788,8 +4790,8 @@ struct SM100_TMEM_STORE_16dp128b2x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -4814,8 +4816,8 @@ struct SM100_TMEM_STORE_16dp128b4x asm volatile ("tcgen05.st.sync.aligned.16x128b.x4.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -4841,8 +4843,8 @@ struct SM100_TMEM_STORE_16dp128b4x_16b asm volatile ("tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -4872,8 +4874,8 @@ struct SM100_TMEM_STORE_16dp128b8x "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4905,8 +4907,8 @@ struct SM100_TMEM_STORE_16dp128b8x_16b "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4946,8 +4948,8 @@ struct SM100_TMEM_STORE_16dp128b16x "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, 
%32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -4991,8 +4993,8 @@ struct SM100_TMEM_STORE_16dp128b16x_16b "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5052,8 +5054,8 @@ struct SM100_TMEM_STORE_16dp128b32x "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5121,8 +5123,8 @@ struct SM100_TMEM_STORE_16dp128b32x_16b "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5222,8 +5224,8 @@ struct SM100_TMEM_STORE_16dp128b64x "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -5339,8 +5341,8 @@ struct SM100_TMEM_STORE_16dp128b64x_16b "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -5394,8 
+5396,8 @@ struct SM100_TMEM_STORE_16dp64b1x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x64b.x1.b32" "[%0]," - "{%1};\n" - : + "{%1};\n" + : : "r"(dst_addr), "r"(src0) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -5418,8 +5420,8 @@ struct SM100_TMEM_STORE_16dp64b1x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32" "[%0]," - "{%1};\n" - : + "{%1};\n" + : : "r"(dst_addr), "r"(src0) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -5442,8 +5444,8 @@ struct SM100_TMEM_STORE_16dp64b2x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x64b.x2.b32" "[%0]," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -5466,8 +5468,8 @@ struct SM100_TMEM_STORE_16dp64b2x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32" "[%0]," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -5490,8 +5492,8 @@ struct SM100_TMEM_STORE_16dp64b4x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x64b.x4.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -5514,8 +5516,8 @@ struct SM100_TMEM_STORE_16dp64b4x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), 
"r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -5540,8 +5542,8 @@ struct SM100_TMEM_STORE_16dp64b8x asm volatile ("tcgen05.st.sync.aligned.16x64b.x8.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -5567,8 +5569,8 @@ struct SM100_TMEM_STORE_16dp64b8x_16b asm volatile ("tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -5598,8 +5600,8 @@ struct SM100_TMEM_STORE_16dp64b16x "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5631,8 +5633,8 @@ struct SM100_TMEM_STORE_16dp64b16x_16b "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5672,8 +5674,8 @@ struct SM100_TMEM_STORE_16dp64b32x "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5717,8 +5719,8 @@ struct SM100_TMEM_STORE_16dp64b32x_16b "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : 
"r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5778,8 +5780,8 @@ struct SM100_TMEM_STORE_16dp64b64x "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5847,8 +5849,8 @@ struct SM100_TMEM_STORE_16dp64b64x_16b "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -5948,8 +5950,8 @@ struct SM100_TMEM_STORE_16dp64b128x "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -6065,8 +6067,8 @@ struct SM100_TMEM_STORE_16dp64b128x_16b "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -6120,8 +6122,8 @@ struct SM100_TMEM_STORE_16dp32b1x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x1.b32" "[%0] , 1," - "{%1};\n" - : + "{%1};\n" + : : "r"(dst_addr), "r"(src0) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6144,8 +6146,8 @@ struct 
SM100_TMEM_STORE_16dp32b1x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32" "[%0] , 2," - "{%1};\n" - : + "{%1};\n" + : : "r"(dst_addr), "r"(src0) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6168,8 +6170,8 @@ struct SM100_TMEM_STORE_16dp32b2x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x2.b32" "[%0] , 2," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6192,8 +6194,8 @@ struct SM100_TMEM_STORE_16dp32b2x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32" "[%0] , 4," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6216,8 +6218,8 @@ struct SM100_TMEM_STORE_16dp32b4x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x4.b32" "[%0] , 4," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6240,8 +6242,8 @@ struct SM100_TMEM_STORE_16dp32b4x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32" "[%0] , 8," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6266,8 +6268,8 @@ struct SM100_TMEM_STORE_16dp32b8x asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x8.b32" "[%0] , 8," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + 
"%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -6293,8 +6295,8 @@ struct SM100_TMEM_STORE_16dp32b8x_16b asm volatile ("tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32" "[%0] , 16," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -6324,8 +6326,8 @@ struct SM100_TMEM_STORE_16dp32b16x "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -6357,8 +6359,8 @@ struct SM100_TMEM_STORE_16dp32b16x_16b "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -6398,8 +6400,8 @@ struct SM100_TMEM_STORE_16dp32b32x "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -6443,8 +6445,8 @@ struct SM100_TMEM_STORE_16dp32b32x_16b "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -6504,8 +6506,8 @@ struct SM100_TMEM_STORE_16dp32b64x "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + 
"%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -6573,8 +6575,8 @@ struct SM100_TMEM_STORE_16dp32b64x_16b "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -6674,8 +6676,8 @@ struct SM100_TMEM_STORE_16dp32b128x "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -6791,8 +6793,8 @@ struct SM100_TMEM_STORE_16dp32b128x_16b "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -6846,8 +6848,8 @@ struct SM100_TMEM_STORE_32dp32b1x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.32x32b.x1.b32" "[%0]," - "{%1};\n" - : + "{%1};\n" + : : "r"(dst_addr), "r"(src0) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6870,8 +6872,8 @@ struct SM100_TMEM_STORE_32dp32b1x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32" "[%0]," - "{%1};\n" - : + "{%1};\n" + : : "r"(dst_addr), "r"(src0) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6894,8 +6896,8 @@ struct 
SM100_TMEM_STORE_32dp32b2x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.32x32b.x2.b32" "[%0]," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6918,8 +6920,8 @@ struct SM100_TMEM_STORE_32dp32b2x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32" "[%0]," - "{%1, %2};\n" - : + "{%1, %2};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6942,8 +6944,8 @@ struct SM100_TMEM_STORE_32dp32b4x #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.32x32b.x4.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6966,8 +6968,8 @@ struct SM100_TMEM_STORE_32dp32b4x_16b #if defined(CUTE_ARCH_TCGEN05_TMEM_ENABLED) asm volatile ("tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32" "[%0]," - "{%1, %2, %3, %4};\n" - : + "{%1, %2, %3, %4};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3) ); #else CUTE_INVALID_CONTROL_PATH("Trying to use TMEM_STORE without CUTE_ARCH_TCGEN05_TMEM_ENABLED."); @@ -6992,8 +6994,8 @@ struct SM100_TMEM_STORE_32dp32b8x asm volatile ("tcgen05.st.sync.aligned.32x32b.x8.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -7019,8 +7021,8 @@ struct SM100_TMEM_STORE_32dp32b8x_16b asm volatile ("tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32" "[%0]," "{%1, %2, %3, %4," - "%5, %6, %7, %8};\n" - : + "%5, %6, %7, %8};\n" + : : "r"(dst_addr), "r"(src0), 
"r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7) ); #else @@ -7050,8 +7052,8 @@ struct SM100_TMEM_STORE_32dp32b16x "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -7083,8 +7085,8 @@ struct SM100_TMEM_STORE_32dp32b16x_16b "{%1, %2, %3, %4," "%5, %6, %7, %8," "%9, %10, %11, %12," - "%13, %14, %15, %16};\n" - : + "%13, %14, %15, %16};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -7124,8 +7126,8 @@ struct SM100_TMEM_STORE_32dp32b32x "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -7169,8 +7171,8 @@ struct SM100_TMEM_STORE_32dp32b32x_16b "%17, %18, %19, %20," "%21, %22, %23, %24," "%25, %26, %27, %28," - "%29, %30, %31, %32};\n" - : + "%29, %30, %31, %32};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -7230,8 +7232,8 @@ struct SM100_TMEM_STORE_32dp32b64x "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : "r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -7299,8 +7301,8 @@ struct SM100_TMEM_STORE_32dp32b64x_16b "%49, %50, %51, %52," "%53, %54, %55, %56," "%57, %58, %59, %60," - "%61, %62, %63, %64};\n" - : + "%61, %62, %63, %64};\n" + : : 
"r"(dst_addr), "r"(src00), "r"(src01), "r"(src02), "r"(src03), "r"(src04), "r"(src05), "r"(src06), "r"(src07), "r"(src08), "r"(src09), "r"(src10), "r"(src11), @@ -7400,8 +7402,8 @@ struct SM100_TMEM_STORE_32dp32b128x "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -7517,8 +7519,8 @@ struct SM100_TMEM_STORE_32dp32b128x_16b "%113, %114, %115, %116," "%117, %118, %119, %120," "%121, %122, %123, %124," - "%125, %126, %127, %128};\n" - : + "%125, %126, %127, %128};\n" + : : "r"(dst_addr), "r"(src000), "r"(src001), "r"(src002), "r"(src003), "r"(src004), "r"(src005), "r"(src006), "r"(src007), "r"(src008), "r"(src009), "r"(src010), "r"(src011), @@ -7561,7 +7563,8 @@ struct SM100_TMEM_STORE_32dp32b128x_16b //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace cute +} // namespace SM100::TMEM::STORE //////////////////////////////////////////////////////////////////////////////////////////////////// +} // end namespace cute diff --git a/include/cute/arch/mma_sm100.hpp b/include/cute/arch/mma_sm100.hpp index 2fa532d2..749da816 100644 --- a/include/cute/arch/mma_sm100.hpp +++ b/include/cute/arch/mma_sm100.hpp @@ -29,7 +29,6 @@ * **************************************************************************************************/ // - // #pragma once @@ -37,6 +36,48 @@ #include #include +#include + namespace cute { +struct SM100_2x1x1_F32F32F32F32 { + using DRegisters = float2[1]; + using ARegisters = float2[1]; + using BRegisters = float[1]; + using CRegisters = float2[1]; + + CUTE_HOST_DEVICE static void + fma(float2 & d01, + float2 const& a01, + float const& b0, + float2 const& c01) + { +#if defined(CUTE_ARCH_FFMA2_SM100_ENABLED) + 
cute::fma(d01, a01, make_float2(b0, b0), c01); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM100_2x1x1_F32F32F32F32 without CUTE_ARCH_FLOAT2_MATH_ENABLED"); +#endif + } +}; + +struct SM100_1x2x1_F32F32F32F32 { + using DRegisters = float2[1]; + using ARegisters = float[1]; + using BRegisters = float2[1]; + using CRegisters = float2[1]; + + CUTE_HOST_DEVICE static void + fma(float2 & d01, + float const& a0, + float2 const& b01, + float2 const& c01) + { +#if defined(CUTE_ARCH_FFMA2_SM100_ENABLED) + cute::fma(d01, make_float2(a0, a0), b01, c01); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM100_1x2x1_F32F32F32F32 without CUTE_ARCH_FFMA2_SM100_ENABLED"); +#endif + } +}; + } // namespace cute diff --git a/include/cute/arch/tmem_allocator_sm100.hpp b/include/cute/arch/tmem_allocator_sm100.hpp index 9839e740..680e237f 100644 --- a/include/cute/arch/tmem_allocator_sm100.hpp +++ b/include/cute/arch/tmem_allocator_sm100.hpp @@ -28,19 +28,34 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* **************************************************************************************************/ -// -// #pragma once #include -#include -#include - -#include +#include +#include +#include namespace cute::TMEM { +// +// TMEM Addressing Constants +// + +// 128 DP x 512 COL x uint32_t-addressing +using MAX_CAPACITY_BITS = Int<128*512*32>; + +// TMEM DP stride in bit-addressing (shift by 5 for conversion from uint32_t) +using DP_b = cute::constant; + +// TMEM DP stride in type-T addressing +template +using DP = cute::constant::OffsetShift)>; + +// +// TMEM Allocators +// + // All operations of this class require that only a single warp uniformly participates class Allocator1Sm { public: @@ -77,8 +92,8 @@ public: asm volatile( "{\n\t" "tcgen05.dealloc.cta_group::1.sync.aligned.b32 %0, %1; \n\t" - "}" - : + "}" + : : "r"(tmem_ptr), "r"(num_columns)); #else CUTE_INVALID_CONTROL_PATH("Attempting to use TMEM allocation PTX without CUTE_ARCH_TCGEN05_TMEM_ENABLED"); @@ -130,7 +145,7 @@ public: } /** - * Frees the TMEM corresponding to the pointer and slice count provided. + * Frees the TMEM corresponding to the pointer and slice count provided. * Release the TMEM after checking that the CTA issuing the free does indeed own the corresponding slices. * @param tmem_ptr Base address of the TMEM address space being freed. * @param num_columns Number of columns being freed. Must be 32 <= num_columns <= 512 and power of 2. 
@@ -146,8 +161,8 @@ public: asm volatile( "{\n\t" "tcgen05.dealloc.cta_group::2.sync.aligned.b32 %0, %1; \n\t" - "}" - : + "}" + : : "r"(tmem_ptr), "r"(num_columns)); #else CUTE_INVALID_CONTROL_PATH("Attempting to use TMEM allocation PTX without CUTE_ARCH_TCGEN05_TMEM_ENABLED"); diff --git a/include/cute/atom/copy_traits_sm100.hpp b/include/cute/atom/copy_traits_sm100.hpp index 6a767ae3..594149d4 100644 --- a/include/cute/atom/copy_traits_sm100.hpp +++ b/include/cute/atom/copy_traits_sm100.hpp @@ -28,13 +28,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ -// - -// #pragma once #include +#include #include #include @@ -230,92 +228,11 @@ struct Copy_Traits using RefLayout = SrcLayout; }; -namespace TMEM { - using MAX_CAPACITY_BITS = Int<128*512*32>; // 128 DP x 512 COL x uint32_t-addressing - - template // TMEM DP stride in type-T addressing - using DP = cute::constant::OffsetShift)>; - - using DP_b = cute::constant; // TMEM DP stride in bit-addressing (shift by 5 for conversion from uint32_t) -} - -// TMEM_LOAD copy_unpack -template -struct TMEM_LOAD_Unpack -{ - template - CUTE_HOST_DEVICE friend constexpr void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_tmem::value, "Expected TMEM src."); - static_assert(is_rmem::value, "Expected RMEM dst."); - - using SrcType = typename TS::value_type; - CUTE_STATIC_ASSERT_V((coalesce(layout(src)) == coalesce(upcast::value>(typename Copy_Traits::ValID{}))), - "Expected src to have the specific TMEM layout required by CopyOp."); - - uint32_t tmem_addr = raw_pointer_cast(src.data()); - - using RegTypeDst = typename remove_extent::type; - Tensor rD = recast(dst); - - constexpr int RegNumDst = extent::value; - CUTE_STATIC_ASSERT_V(size(rD) == Int{}, - "In CopyAtom, dst layout doesn't vectorize into registers. 
This dst layout is incompatible with this CopyOp."); - - // thread idx <=> DP lane assert. - // ASSERT TMEM_LOAD thread attemping to access DP lane within sub-partition. -#if defined(__CUDA_ARCH__) && !defined(NDEBUG) - assert(((uint32_t(threadIdx.x) / 32) % 4) == (((tmem_addr >> 16) / 32) % 4)); -#endif - - detail::explode(CopyOp::copy, - &tmem_addr, seq<0>{}, - rD, make_seq{}); - } -}; - -// TMEM_STORE copy_unpack -template -struct TMEM_STORE_Unpack -{ - template - CUTE_HOST_DEVICE friend constexpr void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected RMEM src."); - static_assert(is_tmem::value, "Expected TMEM dst."); - - using RegTypeSrc = typename remove_extent::type; - Tensor rS = recast(src); - - constexpr int RegNumSrc = extent::value; - CUTE_STATIC_ASSERT_V(size(rS) == Int{}, - "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy."); - - using DstType = typename TD::value_type; - CUTE_STATIC_ASSERT_V((coalesce(layout(dst)) == coalesce(upcast::value>(typename Copy_Traits::ValID{}))), - "Expected dst to have the specific TMEM layout required by CopyOp."); - - uint32_t tmem_addr = raw_pointer_cast(dst.data()); - - // thread idx <=> DP lane assert. - // ASSERT TMEM_LOAD thread attemping to access DP lane within sub-partition. 
-#if defined(__CUDA_ARCH__) && !defined(NDEBUG) - assert(((uint32_t(threadIdx.x) / 32) % 4) == (((tmem_addr >> 16) / 32) % 4)); -#endif - - detail::explode(CopyOp::copy, - rS, make_seq{}, - &tmem_addr, seq<0>{}); - } -}; +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// TMEM Traits and Utilities +// +//////////////////////////////////////////////////////////////////////////////////////////////////// template struct Copy_Atom; @@ -418,10 +335,2406 @@ make_tmem_warp_partitioner(Tensor const& tmem) return make_tiler_impl(layout_tv, tiler); } -} // end namespace cute +namespace SM100::TMEM::LOAD { + +// +// Specialized copy_unpack implementation for SM100::TMEM::LOAD instructions +// + +template +CUTE_HOST_DEVICE constexpr +void +copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) +{ + static_assert(is_tmem::value, "Expected TMEM src."); + static_assert(is_rmem::value, "Expected RMEM dst."); + + using SrcType = typename TS::value_type; + CUTE_STATIC_ASSERT_V((coalesce(layout(src)) == coalesce(upcast::value>(typename Copy_Traits::ValID{}))), + "Expected src to have the specific TMEM layout required by CopyOp."); + + uint32_t tmem_addr = raw_pointer_cast(src.data()); + + using RegTypeDst = typename remove_extent::type; + Tensor rD = recast(dst); + + constexpr int RegNumDst = extent::value; + CUTE_STATIC_ASSERT_V(size(rD) == Int{}, + "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this CopyOp."); + + // thread idx <=> DP lane assert. + // ASSERT TMEM_LOAD thread attempting to access DP lane within sub-partition. 
+#if defined(__CUDA_ARCH__) && !defined(NDEBUG) + assert(((uint32_t(threadIdx.x) / 32) % 4) == (((tmem_addr >> 16) / 32) % 4)); +#endif + + detail::explode(CopyOp::copy, + &tmem_addr, seq<0>{}, + rD, make_seq{}); +} + +} // end namespace SM100::TMEM::LOAD + +namespace SM100::TMEM::STORE { + +// +// Specialized copy_unpack implementation for SM100::TMEM::STORE instructions +// + +template +CUTE_HOST_DEVICE constexpr +void +copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) +{ + static_assert(is_rmem::value, "Expected RMEM src."); + static_assert(is_tmem::value, "Expected TMEM dst."); + + using RegTypeSrc = typename remove_extent::type; + Tensor rS = recast(src); + + constexpr int RegNumSrc = extent::value; + CUTE_STATIC_ASSERT_V(size(rS) == Int{}, + "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy."); + + using DstType = typename TD::value_type; + CUTE_STATIC_ASSERT_V((coalesce(layout(dst)) == coalesce(upcast::value>(typename Copy_Traits::ValID{}))), + "Expected dst to have the specific TMEM layout required by CopyOp."); + + uint32_t tmem_addr = raw_pointer_cast(dst.data()); + + // thread idx <=> DP lane assert. + // ASSERT TMEM_STORE thread attempting to access DP lane within sub-partition. 
+#if defined(__CUDA_ARCH__) && !defined(NDEBUG) + assert(((uint32_t(threadIdx.x) / 32) % 4) == (((tmem_addr >> 16) / 32) % 4)); +#endif + + detail::explode(CopyOp::copy, + rS, make_seq{}, + &tmem_addr, seq<0>{}); +} + +} // end namespace SM100::TMEM::STORE + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// TMEM_LOAD Copy Traits +// //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace cute { +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b1x; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + // Logical bit id to bit idx (address) + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride< _0, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_64, _2>>, + Stride,Stride< _1,_2048>>>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2>>, + Stride,Stride< _1,_2048>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b2x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _2>>, + Stride,Stride< 
_1,_4096,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _2>>, + Stride,Stride< _1,_4096,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b4x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _4>>, + Stride,Stride< _1,_8192,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _4>>, + Stride,Stride< _1,_8192,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b8x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _8>>, + Stride,Stride< _1,_16384,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using 
SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _8>>, + Stride,Stride< _1,_16384,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b16x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _16>>, + Stride,Stride< _1,_32768,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _16>>, + Stride,Stride< _1,_32768,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b32x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _32>>, + Stride,Stride< _1,_65536,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = 
Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_64, _2, _32>>, + Stride,Stride< _1,_65536,_256>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b1x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2>>, + Stride,Stride< _1,_1024>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2>>, + Stride,Stride< _1,_1024>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b2x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _2>>, + Stride,Stride< _1,_2048,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _2>>, + Stride,Stride< _1,_2048,_128>>>; + using RefLayout = SrcLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b4x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _4>>, + Stride,Stride< _1,_4096,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _4>>, + Stride,Stride< _1,_4096,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b8x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _8>>, + Stride,Stride< _1,_8192,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _8>>, + Stride,Stride< _1,_8192,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b16x; + +template <> +struct Copy_Traits +{ + using 
ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _16>>, + Stride,Stride< _1,_16384,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _16>>, + Stride,Stride< _1,_16384,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b32x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _32>>, + Stride,Stride< _1,_32768,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _32>>, + Stride,Stride< _1,_32768,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b64x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _64>>, + Stride,Stride< 
_1,_65536,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2, _64>>, + Stride,Stride< _1,_65536,_128>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b1x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_32>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_32>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b2x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + 
using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _2>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b4x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _4>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _4>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b8x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _8>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32, _8>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b16x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_16>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_16>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b32x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_32>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_32>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b64x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + 
Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_64>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_64>>, + Stride,Stride< _1,_64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b128x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_128>>, + Stride,Stride< _1, _64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b128x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,Shape <_32,_128>>, + Stride,Stride< _1, _64>>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b1x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_32>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_32>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b2x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_64>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_64>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b4x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_128>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = 
Layout,_128>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b8x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_256>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_256>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b16x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_512>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_512>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b32x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + 
using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_1024>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_1024>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b64x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_2048>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_2048>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b128x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_4096>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b128x_16b; + +template <> +struct Copy_Traits +{ + 
using ThrID = Layout<_32>; + using ValID = Layout, _16>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout,_4096>, + Stride, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b1x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_32, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_32, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b2x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_64, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_64, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using 
SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b4x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_128, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_128, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b8x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_256, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_256, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b16x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_512, _1>>; + using RefLayout = SrcLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_512, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b32x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_1024, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_1024, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b64x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_2048, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using 
DstLayout = Layout, + Stride<_2048, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b128x; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, + Stride< _1,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_4096, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b128x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = Layout<_32>; + using ValID = Layout, _32>, + Stride,TMEM::DP_b>>; + using SrcLayout = Layout, + Stride< _0, _1>>; + using DstLayout = Layout, + Stride<_4096, _1>>; + using RefLayout = SrcLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// TMEM_STORE Copy Traits +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b1x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; 
+ using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b2x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b4x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b8x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b16x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b32x; + +template <> 
+struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b1x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b2x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using 
DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b4x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b8x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b16x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b32x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b32x_16b; + +template 
<> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b64x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b1x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using 
DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b2x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b4x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b8x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b16x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b32x; + +template <> 
+struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b64x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b128x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using 
DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b128x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b1x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b2x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b4x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b8x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b8x_16b; + +template <> 
+struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b16x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b32x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using 
DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b64x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b128x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b128x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b1x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b1x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b2x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b2x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b4x; + +template <> +struct 
Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b4x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b8x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b8x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b16x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = 
typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b16x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b32x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b32x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b64x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b64x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b128x; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b128x_16b; + +template <> +struct Copy_Traits +{ + using ThrID = typename Copy_Traits::ThrID; + using ValID = typename Copy_Traits::ValID; + using SrcLayout = typename Copy_Traits::DstLayout; + using DstLayout = typename Copy_Traits::SrcLayout; + using RefLayout = typename Copy_Traits::RefLayout; +}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1159,2183 +3472,38 @@ tmem_load_to_store(CopyOp) { } } -//////////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace TMEM //////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -// -// TMEM_LOAD Copy Traits -// -//////////////////////////////////////////////////////////////////////////////////////////////////// - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - // Logical thread id to thread idx (warp) - using ThrID = Layout<_32>; - // Logical bit id to bit idx (address) - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0, _1>>; - // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout,Shape <_64, _2>>, - Stride,Stride< _1,_2048>>>; - // Reference map from (thr,val) to bit - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2>>, - Stride,Stride< _1,_2048>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _2>>, - Stride,Stride< _1,_4096,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _2>>, - Stride,Stride< _1,_4096,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using 
ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _4>>, - Stride,Stride< _1,_8192,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _4>>, - Stride,Stride< _1,_8192,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _8>>, - Stride,Stride< _1,_16384,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _8>>, - Stride,Stride< _1,_16384,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _16>>, - Stride,Stride< _1,_32768,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - 
-template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _16>>, - Stride,Stride< _1,_32768,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _32>>, - Stride,Stride< _1,_65536,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_64, _2, _32>>, - Stride,Stride< _1,_65536,_256>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2>>, - Stride,Stride< _1,_1024>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2>>, - Stride,Stride< _1,_1024>>>; - using RefLayout = SrcLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _2>>, - Stride,Stride< _1,_2048,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _2>>, - Stride,Stride< _1,_2048,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _4>>, - Stride,Stride< _1,_4096,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _4>>, - Stride,Stride< _1,_4096,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _8>>, - Stride,Stride< 
_1,_8192,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _8>>, - Stride,Stride< _1,_8192,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _16>>, - Stride,Stride< _1,_16384,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _16>>, - Stride,Stride< _1,_16384,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _32>>, - Stride,Stride< _1,_32768,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - 
using DstLayout = Layout,Shape <_32, _2, _32>>, - Stride,Stride< _1,_32768,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _64>>, - Stride,Stride< _1,_65536,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2, _64>>, - Stride,Stride< _1,_65536,_128>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_32>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_32>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using 
DstLayout = Layout,Shape <_32, _2>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _2>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _4>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _4>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32, _8>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - 
using DstLayout = Layout,Shape <_32, _8>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_16>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_16>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_32>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_32>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - 
using DstLayout = Layout,Shape <_32,_64>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_64>>, - Stride,Stride< _1,_64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_128>>, - Stride,Stride< _1, _64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,Shape <_32,_128>>, - Stride,Stride< _1, _64>>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_32>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = 
Layout,_32>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_64>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_64>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_128>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_128>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_256>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_256>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_512>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_512>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_1024>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_1024>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - 
-template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_2048>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_2048>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_4096>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _16>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout,_4096>, - Stride, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_32, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, 
_32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_32, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_64, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_64, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_128, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_128, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_256, 
_1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_256, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_512, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_512, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_1024, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_1024, _1>>; - using RefLayout = SrcLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_2048, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_2048, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, - Stride< _1,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_4096, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// -template <> -struct Copy_Traits - : TMEM_LOAD_Unpack -{ - using ThrID = Layout<_32>; - using ValID = Layout, _32>, - Stride,TMEM::DP_b>>; - using SrcLayout = Layout, - Stride< _0, _1>>; - using DstLayout = Layout, - Stride<_4096, _1>>; - using RefLayout = SrcLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// -// TMEM_STORE Copy Traits -// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : 
TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = 
typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename 
Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : 
TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = 
typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename 
Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : 
TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = 
typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename 
Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : 
TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - 
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = 
typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename 
Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template <> -struct Copy_Traits - : TMEM_STORE_Unpack -{ - using ThrID = typename Copy_Traits::ThrID; - using ValID = typename Copy_Traits::ValID; - using SrcLayout = typename Copy_Traits::DstLayout; - using DstLayout = typename Copy_Traits::SrcLayout; - using RefLayout = typename Copy_Traits::RefLayout; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////////// // // UTCCP Copy Traits // //////////////////////////////////////////////////////////////////////////////////////////////////// +namespace SM100::TMEM::UTCCP { + +// +// Specialized copy_unpack implementation for SM100::TMEM::UTCCP instructions +// + +template +CUTE_HOST_DEVICE constexpr +void +copy_unpack(Copy_Traits const&, + Tensor const& src, + Tensor & dst) +{ + static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); + static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); + CopyOp::copy(src[0], raw_pointer_cast(dst.data())); +} + +} // end namespace SM100::TMEM::UTCCP + // In the following UTCCP traits, the ValID is representing: // logical_bit_idx -> tmem_addr_offset. 
// And the logical_bit_idx is numbered in the order of: @@ -3344,132 +3512,77 @@ struct Copy_Traits // The last two modes provide boradcast transformation for 4x32DP and 2x64DP. // With above, the strides of first two modes are neccessary to be TMEM::DP_b and 1. // And the stride of the third mode in the SrcLayout must be zero. + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_128dp256bit_1cta; + template <> struct Copy_Traits { using ThrID = Layout<_1>; - // logical bit_idx -> tmem_addr using ValID = Layout, Stride>; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout, Stride<_0, _1>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0,_1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_128dp256bit_1cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_128dp256bit_2cta; + template <> struct Copy_Traits { using ThrID = Layout<_2>; - // logical bit_idx -> tmem_addr using ValID = typename Copy_Traits::ValID; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout, Stride<_0, _1>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0, _1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - 
static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_128dp256bit_2cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_128dp128bit_1cta; + template <> struct Copy_Traits { using ThrID = Layout<_1>; - // logical bit_idx -> tmem_addr using ValID = Layout, Stride>; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout, Stride<_0, _1>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0,_1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_128dp128bit_1cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_128dp128bit_2cta; + template <> struct Copy_Traits { using ThrID = Layout<_2>; - // logical bit_idx -> tmem_addr using ValID = typename Copy_Traits::ValID; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout, Stride<_0, _1>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0, _1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_128dp128bit_2cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_4dp256bit_1cta; + template <> struct Copy_Traits { @@ -3485,66 +3598,35 @@ struct Copy_Traits */ using ThrID = Layout<_1>; - // logical bit_idx -> tmem_addr using ValID = Layout, Stride>; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_32,_128>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_0,Stride<_32,_128>>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_4dp256bit_1cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_4dp256bit_2cta; + template <> struct Copy_Traits { - using ThrID = Layout<_2>; - // logical bit_idx -> tmem_addr using ValID = typename Copy_Traits::ValID; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_32,_128>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_0,Stride<_32,_128>>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_4dp256bit_2cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
+using SM100::TMEM::UTCCP::SM100_UTCCP_4x32dp128bit_1cta; + template <> struct Copy_Traits { @@ -3556,64 +3638,33 @@ struct Copy_Traits // [core_matrix_strided, core_matrix_leading, broadcast] using ValID = Layout, Stride<_DP,_1, _DPx32>>; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_1, _32, _0>>>; - - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0,_1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_4x32dp128bit_1cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_4x32dp128bit_2cta; + template <> struct Copy_Traits { - using ThrID = Layout<_2>; - // logical bit_idx -> tmem_addr using ValID = typename Copy_Traits::ValID; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_1, _32, _0>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0,_1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_4x32dp128bit_2cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0213_1cta; + template <> struct Copy_Traits { @@ 
-3625,63 +3676,34 @@ struct Copy_Traits // [core_matrix_strided, core_matrix_leading, broadcast] using ValID = Layout, Stride<_DP,_1, _DPx64>>; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_1, _64, _0>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0, _1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_2x64dp128bitlw0213_1cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0213_2cta; + template <> struct Copy_Traits { - using ThrID = Layout<_2>; - // logical bit_idx -> tmem_addr using ValID = typename Copy_Traits::ValID; - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_1, _64, _0>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0, _1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_2x64dp128bitlw0213_2cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0123_1cta; + template <> struct Copy_Traits { @@ -3695,62 +3717,31 @@ struct Copy_Traits using ValID = Layout, Stride<_DP,_1 
,_DPx64,_DPx32>>; - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_1, _32,_4096,_0>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0, _1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_2x64dp128bitlw0123_1cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0123_2cta; + template <> struct Copy_Traits { - using ThrID = Layout<_2>; - // logical bit_idx -> tmem_addr using ValID = typename Copy_Traits::ValID; - - // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, Stride<_0,Stride<_1, _32, _4096,_0>>>; - // Map from (dst-thr,dst-val) to bit using DstLayout = Layout, Stride<_0,_1>>; - // Reference map from (thr,val) to bit using RefLayout = DstLayout; - - - template - CUTE_HOST_DEVICE friend constexpr - void - copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor & dst) - { - static_assert(is_rmem::value, "Expected smem_desc src for SM100_UTCCP"); - static_assert(is_tmem::value, "Expected tmem dst for SM100_UTCCP"); - SM100_UTCCP_2x64dp128bitlw0123_2cta::copy(src[0], raw_pointer_cast(dst.data())); - } }; +//////////////////////////////////////////////////////////////////////////////////////////////////// + template CUTE_HOST_DEVICE constexpr @@ -3775,4 +3766,3 @@ make_utccp_copy(CopyOp const&, } // namespace cute -//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/atom/copy_traits_sm90_im2col.hpp 
b/include/cute/atom/copy_traits_sm90_im2col.hpp index beefa63f..e4d1e3ff 100644 --- a/include/cute/atom/copy_traits_sm90_im2col.hpp +++ b/include/cute/atom/copy_traits_sm90_im2col.hpp @@ -647,7 +647,7 @@ make_tma_atom_im2col(CopyOp, gtensor_cwhdn, range_c, range_whdn, - detail::get_swizzle_portion(slayout), + get_swizzle_portion(slayout), tma_layout_vt, lower_corner_whd, upper_corner_whd, diff --git a/include/cute/atom/mma_traits_sm100.hpp b/include/cute/atom/mma_traits_sm100.hpp index f336eff2..820dc103 100644 --- a/include/cute/atom/mma_traits_sm100.hpp +++ b/include/cute/atom/mma_traits_sm100.hpp @@ -37,10 +37,13 @@ #include #include #include -#include // cute::TMEM:: +#include // cute::TMEM:: + #include #include // cute::GMMA:: #include // cute::GMMA:: +#include // UTCCP smem desc + #include // Check that aggregate initialization in .with() initializes all fields @@ -417,6 +420,9 @@ constexpr auto get_utccp_smem_desc_tensor(Tensor const& smem_u namespace UMMA { +// Import TMEM constants +namespace TMEM = cute::TMEM; + enum class TmemAllocMode { // Default allocation mode. // If a TMEM Atom uses a half-subpartition (16DPs), then multiple atoms can be @@ -3053,7 +3059,7 @@ struct MMA_Traits <= 8 && cute::sizeof_bits_v <= 8, "SM100_MMA_F8F6F4_2x1SM_SS supports types with leq 8bit types"); static_assert(M == 128 || M == 256, "SM100_MMA_F8F6F4_2x1SM_SS M-mode size should be 64 or 128 for 1 CTA cluster MMA."); static_assert((N % 32 == 0) && (32 <= N) && (N <= 256), "SM100_MMA_F8F6F4_2x1SM_SS N-mode size should be a multiple of 32 between 32 and 256."); - + using FrgTypeA = UMMA::smem_desc; using FrgTypeB = UMMA::smem_desc; using FrgTypeC = UMMA::tmem_frg_2sm; diff --git a/include/cute/container/tuple.hpp b/include/cute/container/tuple.hpp index 187b7e41..e3dd6d27 100644 --- a/include/cute/container/tuple.hpp +++ b/include/cute/container/tuple.hpp @@ -51,8 +51,8 @@ // but do _not_ include references like int& or float&. 
// (See std::tie for an example of a tuple of references.) // -// Standard-layout types preserve ABI across host-device boundaries. -// They are safe to use as device kernel parameters. +// Standard-layout types preserve ABI across host-device boundaries. They are safe to use as device kernel parameters. +// The standard-layout requirement prevents a more common EBO-based implemented of cute::tuple. // // The cute::tuple is also simplified over the implementations in std::, cuda::std::, and thrust:: by ignoring much of // the conversion SFINAE, special overloading, and avoiding cvref template types. @@ -62,12 +62,15 @@ namespace cute { -namespace detail +template +struct tuple; + +namespace eso { // ESO stands for "empty structure optimization." -// We use this technique to ensure that cute::tuple -// doesn't waste space storing template arguments that have no data (like integral_constant). +// We use this technique to ensure that cute::tuple doesn't waste space +// storing template arguments that have no data (like integral_constant). // Empty types in the template argument list are not even constructed, // and do not have unique element addresses. Calling `get` // constructs and returns an instance of an empty type on demand. 
@@ -131,94 +134,92 @@ struct ESO { }; // Get Nth value from ESO -template +template CUTE_HOST_DEVICE constexpr -cute::enable_if_t>>::value, - cute::tuple_element_t>> -getv(ESO const&) -{ - return {}; -} - -template -CUTE_HOST_DEVICE constexpr -cute::enable_if_t>>::value, - cute::tuple_element_t> const&> -getv(ESO const& s) +R +getr(S&& s) noexcept { if constexpr (N == 0) { - return static_cast(s.first_); + return static_cast(s).first_; } else { - return getv(s.rest_); + return getr(static_cast(s).rest_); } - CUTE_GCC_UNREACHABLE; } -template +// Compilers disagree on decltype(auto), so these implementations avoid it at cost +template CUTE_HOST_DEVICE constexpr -cute::enable_if_t>>::value, - cute::tuple_element_t> &> -getv(ESO& s) +cute::conditional_t>>::value, + cute::tuple_element_t>, + cute::tuple_element_t> const&> +getv_cr(ESO const& s) noexcept { - if constexpr (N == 0) { - return static_cast(s.first_); + if constexpr (cute::is_empty>>::value) { + return {}; } else { - return getv(s.rest_); + return getr> const&, N>(s); } - CUTE_GCC_UNREACHABLE; } -template +template CUTE_HOST_DEVICE constexpr -cute::enable_if_t>>::value, - cute::tuple_element_t> &&> -getv(ESO&& s) +cute::conditional_t>>::value, + cute::tuple_element_t>, + cute::tuple_element_t> &> +getv_r(ESO& s) noexcept { - if constexpr (N == 0) { - return static_cast(s.first_); + if constexpr (cute::is_empty>>::value) { + return {}; } else { - return getv(static_cast&&>(s.rest_)); + return getr> &, N>(s); } - CUTE_GCC_UNREACHABLE; } -template +template CUTE_HOST_DEVICE constexpr -auto -findt(ESO const& t) noexcept +cute::conditional_t>>::value, + cute::tuple_element_t>, + cute::tuple_element_t> &&> +getv_rr(ESO&& s) noexcept { - if constexpr (cute::is_same_v) { - return C{}; - } else - if constexpr (sizeof...(Rest) == 0) { - return C{}; - } else - if constexpr (IsRestEmpty) { - return cute::detail::findt(ESO_t{}); + if constexpr (cute::is_empty>>::value) { + return {}; } else { - return 
cute::detail::findt(t.rest_); + return getr> &&, N>(static_cast&&>(s)); } + CUTE_GCC_UNREACHABLE; } -} // end namespace detail +} // end namespace eso template -struct tuple : detail::ESO_t +struct tuple : eso::ESO_t { CUTE_HOST_DEVICE constexpr tuple() {} CUTE_HOST_DEVICE constexpr - tuple(T const&... t) : detail::ESO_t(t...) {} + tuple(T const&... t) : eso::ESO_t(t...) {} }; template <> struct tuple<> {}; +// +// make_tuple (value-based implementation) +// + +template +CUTE_HOST_DEVICE constexpr +tuple +make_tuple(T const&... t) +{ + return {t...}; +} + // Returns the element in the ith position of the tuple template CUTE_HOST_DEVICE constexpr @@ -226,7 +227,7 @@ decltype(auto) get(tuple const& t) noexcept { static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(t); + return eso::getv_cr(t); } template @@ -235,7 +236,7 @@ decltype(auto) get(tuple& t) noexcept { static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(t); + return eso::getv_r(t); } template @@ -244,22 +245,22 @@ decltype(auto) get(tuple&& t) noexcept { static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(static_cast&&>(t)); + return eso::getv_rr(static_cast&&>(t)); } -// Returns the position of type X (as a static integer) in the tuple -// type's argument list. X must be unique in the argument list. +// Returns the first position of type X (as a static integer) in the tuple +// type's argument list. 
template CUTE_HOST_DEVICE constexpr auto -find(tuple const& t) noexcept +find(tuple const&) noexcept { - return detail::findt(t); + return cute::C...>>{}; } // // Custom is_tuple trait simply checks the existence of tuple_size -// and assumes std::get(.), std::tuple_element +// and assumes get(.), tuple_element // namespace detail { @@ -273,19 +274,7 @@ template struct is_tuple : decltype(detail::has_tuple_size((T*)0)) {}; template -constexpr bool is_tuple_v = cute::is_tuple::value; - -// -// make_tuple (value-based implementation) -// - -template -CUTE_HOST_DEVICE constexpr -tuple -make_tuple(T const&... t) -{ - return {t...}; -} +static constexpr bool is_tuple_v = cute::is_tuple::value; // // tuple_cat concatenates multiple cute::tuple into a single cute::tuple, diff --git a/include/cute/container/type_list.hpp b/include/cute/container/type_list.hpp index b8ac5f0d..dfffbe25 100644 --- a/include/cute/container/type_list.hpp +++ b/include/cute/container/type_list.hpp @@ -31,6 +31,7 @@ #pragma once #include // CUTE_HOST_DEVICE, CUTE_STL_NAMESPACE +#include namespace cute { @@ -39,11 +40,35 @@ template struct type_list {}; // get for type_list -// requires tuple_element_t> to have std::is_default_constructible +// Get an instance of the Ith type in the pack T... +// Requires tuple_element_t> to have std::is_default_constructible template CUTE_HOST_DEVICE constexpr CUTE_STL_NAMESPACE::tuple_element_t> -get(type_list const& t) noexcept { +get(type_list const&) noexcept { + return {}; +} + +// Find the index of the first true in the pack B... +template +struct find_true { + CUTE_HOST_DEVICE static constexpr size_t find() { + size_t i = 0; + (void) ((B ? true : (++i, false)) || ...); + return i; + } + static constexpr size_t value = find(); +}; + +template +static constexpr size_t find_true_v = find_true::value; + +// find for type_list +// Finds the first position of type X (as a static integer) in the T... 
pack +template +CUTE_HOST_DEVICE constexpr +CUTE_STL_NAMESPACE::integral_constant...>> +find(type_list const&) noexcept { return {}; } @@ -69,9 +94,8 @@ struct tuple_size> template struct tuple_element> -{ - using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; -}; + : CUTE_STL_NAMESPACE::tuple_element> +{}; } // end namespace std @@ -94,9 +118,8 @@ struct tuple_size> template struct tuple_element> -{ - using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; -}; + : CUTE_STL_NAMESPACE::tuple_element> +{}; } // end namespace std #endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp index 97eafa7a..3f02a41d 100644 --- a/include/cute/layout.hpp +++ b/include/cute/layout.hpp @@ -834,7 +834,7 @@ coalesce_x(Layout const& layout) } else { return detail::bw_coalesce(flat_shape, flat_stride, get(flat_shape), get(flat_stride)); } - + CUTE_GCC_UNREACHABLE; } @@ -1030,7 +1030,7 @@ template CUTE_HOST_DEVICE constexpr auto -composition_impl(LShape const& lhs_shape, LStride const& lhs_stride, +composition_impl(LShape const& lhs_shape, [[maybe_unused]] LStride const& lhs_stride, RShape const& rhs_shape, RStride const& rhs_stride) { if constexpr (is_tuple::value) { // Right-distributivity of Layout composition for RHS tuple @@ -1067,7 +1067,7 @@ composition_impl(LShape const& lhs_shape, LStride const& lhs_stride, auto rest_stride = get<3>(init); auto curr_shape = get(lhs_shape); - auto curr_stride = get(lhs_stride); + [[maybe_unused]] auto curr_stride = get(lhs_stride); // Strong divisibility condition -- requires composition to be statically verifiable. 
//CUTE_STATIC_ASSERT_V(((rest_stride % curr_shape) == Int<0>{}) or (rest_stride < curr_shape), "Stride Divisibility Condition"); diff --git a/include/cute/swizzle_layout.hpp b/include/cute/swizzle_layout.hpp index 43d3c4b2..ef1ca18e 100644 --- a/include/cute/swizzle_layout.hpp +++ b/include/cute/swizzle_layout.hpp @@ -128,8 +128,6 @@ make_fragment_like(ComposedLayout,Offset,Layout> const& layout) // Utilities // -namespace detail { - // Get just the Swizzle part of a composed layout. template CUTE_HOST_DEVICE constexpr @@ -167,8 +165,6 @@ get_nonswizzle_portion(Layout const& slayout) return slayout; } -} // namespace detail - // // Slice a Swizzled ComposedLayout // diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index c634e884..c9c636a0 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -42,7 +42,7 @@ namespace cutlass { namespace arch { constexpr int sm100_smem_capacity_bytes = 232448; -constexpr int sm120_smem_capacity_bytes = 102400; +constexpr int sm120_smem_capacity_bytes = 101376; #if defined(__NVCC__) || defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index d7036baf..3d5ec10b 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -50,6 +50,9 @@ #define CUTLASS_ARCH_TCGEN_ENABLED 1 #endif +#if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED)) +#define CUTLASS_ARCH_TCGEN_ENABLED 1 +#endif namespace cutlass { /// @brief diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h index 1dd27f78..e5daf829 100644 --- a/include/cutlass/arch/config.h +++ b/include/cutlass/arch/config.h @@ -92,6 +92,14 @@ #define CUTLASS_ARCH_MMA_SM100A_ENABLED 1 #endif + // SM100f + #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)) + #define CUTLASS_ARCH_MMA_SM100F_SUPPORTED 1 + #endif + + #if 
(!defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && CUDA_ARCH_FAMILY(1000)) + #define CUTLASS_ARCH_MMA_SM100F_ENABLED CUTLASS_ARCH_MMA_SM100F_SUPPORTED + #endif #endif #endif @@ -109,6 +117,14 @@ #define CUTLASS_ARCH_MMA_SM101A_ENABLED 1 #endif + // SM101f + #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)) + #define CUTLASS_ARCH_MMA_SM101F_SUPPORTED 1 + #endif + + #if (!defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) && CUDA_ARCH_FAMILY(1010)) + #define CUTLASS_ARCH_MMA_SM101F_ENABLED CUTLASS_ARCH_MMA_SM101F_SUPPORTED + #endif #endif #endif @@ -124,12 +140,21 @@ #define CUTLASS_ARCH_MMA_SM120A_ENABLED 1 #endif + // SM120f + #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)) + #define CUTLASS_ARCH_MMA_SM120F_SUPPORTED 1 + #endif + + #if (!defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) && CUDA_ARCH_FAMILY(1200)) + #define CUTLASS_ARCH_MMA_SM120F_ENABLED CUTLASS_ARCH_MMA_SM120F_SUPPORTED + #endif #endif #endif -#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\ - defined(CUTLASS_ARCH_MMA_SM120A_ENABLED)) +#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) ||\ + defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\ + defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120F_ENABLED)) # define CUTLASS_ARCH_CLC_ENABLED #endif diff --git a/include/cutlass/arch/grid_dependency_control.h b/include/cutlass/arch/grid_dependency_control.h index ae66de27..e7defb5d 100644 --- a/include/cutlass/arch/grid_dependency_control.h +++ b/include/cutlass/arch/grid_dependency_control.h @@ -53,6 +53,20 @@ #endif #endif +#ifndef CUTLASS_GDC_ENABLED + #if(CUDA_BARRIER_ENABLED && \ + defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \ + defined(__CUDA_ARCH__) && \ + ((__CUDA_ARCH__ == 1000 &&\ + (defined(__CUDA_ARCH_FEAT_SM100_ALL) || CUDA_ARCH_FAMILY(1000))) || \ + (__CUDA_ARCH__ == 1010 &&\ + 
(defined(__CUDA_ARCH_FEAT_SM101_ALL) || CUDA_ARCH_FAMILY(1010))) || \ + (__CUDA_ARCH__ == 1200 &&\ + (defined(__CUDA_ARCH_FEAT_SM120_ALL) || CUDA_ARCH_FAMILY(1200))))) + #define CUTLASS_GDC_ENABLED + #endif +#endif + namespace cutlass { namespace arch { @@ -84,6 +98,5 @@ static constexpr bool IsGdcGloballyEnabled = true; static constexpr bool IsGdcGloballyEnabled = false; #endif - } // namespace arch } // namespace cutlass diff --git a/include/cutlass/arch/reg_reconfig.h b/include/cutlass/arch/reg_reconfig.h index 557643e5..a65ee328 100644 --- a/include/cutlass/arch/reg_reconfig.h +++ b/include/cutlass/arch/reg_reconfig.h @@ -47,6 +47,14 @@ #define CUDA_CTA_RECONFIG_ACTIVATED 1 #endif + #if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 12 && ( \ + (__CUDA_ARCH__ == 1000 && CUDA_ARCH_FAMILY(1000)) \ + || (__CUDA_ARCH__ == 1010 && CUDA_ARCH_FAMILY(1010)) \ + || (__CUDA_ARCH__ == 1200 && CUDA_ARCH_FAMILY(1200)) \ + ) + #define CUDA_CTA_RECONFIG_ACTIVATED 1 + #endif + #endif namespace cutlass { diff --git a/include/cutlass/conv/collective/builders/sm100_umma_builder.inl b/include/cutlass/conv/collective/builders/sm100_umma_builder.inl index db1f7dae..9a9d4cb4 100644 --- a/include/cutlass/conv/collective/builders/sm100_umma_builder.inl +++ b/include/cutlass/conv/collective/builders/sm100_umma_builder.inl @@ -168,7 +168,7 @@ private: // Calculate SMEM matrix A and B buffers' pipeline stages static constexpr uint32_t AccumulatorPipelineStageCount = 2; - static constexpr uint32_t SchedulerPipelineStageCount = 2; + static constexpr uint32_t SchedulerPipelineStageCount = 1; static constexpr uint32_t CLCResponseSize = 16; // AccumulatorPipeline = PipelineUmmaAsync @@ -179,8 +179,6 @@ private: static constexpr auto LoadOrderBarrierStorage = sizeof(typename cutlass::OrderedSequenceBarrier<1,2>::SharedStorage); // CLC (scheduler) response static constexpr auto CLCResponseStorage = SchedulerPipelineStageCount * CLCResponseSize; - // CLC Throttle pipeline storage - static 
constexpr auto CLCThrottlePipelineStorage = sizeof(typename cutlass::PipelineAsync::SharedStorage); // Tmem dealloc static constexpr auto TmemDeallocStorage = sizeof(cutlass::arch::ClusterBarrier); // Tmem ptr storage @@ -190,7 +188,6 @@ private: CLCPipelineStorage + LoadOrderBarrierStorage + TmemDeallocStorage + - CLCThrottlePipelineStorage + CLCResponseStorage + TmemBasePtrsStorage); // Reduce SMEM capacity available for buffers considering barrier allocations. @@ -204,7 +201,12 @@ private: constexpr static int NumSpatialDimensions = detail::gmem_layout_tags_to_spatial_dims(); using DispatchPolicy = cutlass::conv::MainloopSm100TmaUmmaWarpSpecializedImplicitGemm< - ConvOp, PipelineStages, NumSpatialDimensions, ClusterShape_MNK>; + ConvOp, + PipelineStages, + NumSpatialDimensions, + SchedulerPipelineStageCount, + AccumulatorPipelineStageCount, + ClusterShape_MNK>; public: using CollectiveOp = cutlass::conv::collective::CollectiveConv< diff --git a/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp b/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp index dc75b988..278f69f9 100644 --- a/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp +++ b/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp @@ -28,9 +28,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* **************************************************************************************************/ -// -// #pragma once @@ -66,6 +64,8 @@ template < conv::Operator ConvOp, int Stages, int NumSpatialDims, + int SchedulerPipelineStageCount, + int AccumulatorPipelineStageCount, class ClusterShape, // Static cluster shape or dynamic (int, int, _1) class TileShapeMNKL_, // (MmaAtomShapeM, MmaAtomShapeN, TileK, optional: TileL) class ElementA_, @@ -75,7 +75,12 @@ template < class TileTraitsB_> struct CollectiveConv< MainloopSm100TmaUmmaWarpSpecializedImplicitGemm< - ConvOp, Stages, NumSpatialDims, ClusterShape>, + ConvOp, + Stages, + NumSpatialDims, + SchedulerPipelineStageCount, + AccumulatorPipelineStageCount, + ClusterShape>, TileShapeMNKL_, ElementA_, ElementB_, @@ -87,7 +92,12 @@ struct CollectiveConv< // Type Aliases // using DispatchPolicy = MainloopSm100TmaUmmaWarpSpecializedImplicitGemm< - ConvOp, Stages, NumSpatialDims, ClusterShape>; + ConvOp, + Stages, + NumSpatialDims, + SchedulerPipelineStageCount, + AccumulatorPipelineStageCount, + ClusterShape>; using TileShape = decltype(cute::take<0,3>(TileShapeMNKL_{})); // (MmaAtomShapeM, MmaAtomShapeN, TileK) using ElementA = ElementA_; using ElementB = ElementB_; @@ -348,10 +358,12 @@ public: // Constructor // CUTLASS_DEVICE - CollectiveConv(Params const& params) { + CollectiveConv(Params const& params, ClusterShape cluster_shape, uint32_t block_rank_in_cluster) + : cluster_shape_(cluster_shape) + , block_rank_in_cluster_(block_rank_in_cluster) { if constexpr (IsDynamicCluster) { - dim3 cs = cute::cluster_shape(); - const bool is_fallback_cluster = (cs.x == params.cluster_shape_fallback.x && cs.y == params.cluster_shape_fallback.y); + const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x && + cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y); observed_tma_load_a_ = is_fallback_cluster ? 
¶ms.tma_load_a_fallback : ¶ms.tma_load_a; observed_tma_load_b_ = is_fallback_cluster ? ¶ms.tma_load_b_fallback : ¶ms.tma_load_b; } @@ -648,28 +660,14 @@ public: } /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance - CUTLASS_DEVICE static void - prefetch_tma_descriptors(Params const& mainloop_params) { - if constexpr (IsDynamicCluster) { - dim3 cs = cute::cluster_shape(); - const bool is_fallback_cluster = (cs.x == mainloop_params.cluster_shape_fallback.x && cs.y == mainloop_params.cluster_shape_fallback.y); - if (is_fallback_cluster) { - cute::prefetch_tma_descriptor(mainloop_params.tma_load_a_fallback.get_tma_descriptor()); - cute::prefetch_tma_descriptor(mainloop_params.tma_load_b_fallback.get_tma_descriptor()); - } - else { - cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); - } - } - else { - cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); - } + CUTLASS_DEVICE void + prefetch_tma_descriptors() { + cute::prefetch_tma_descriptor(observed_tma_load_a_->get_tma_descriptor()); + cute::prefetch_tma_descriptor(observed_tma_load_b_->get_tma_descriptor()); } /// Construct A Single Stage's Accumulator Shape - CUTLASS_DEVICE auto + CUTLASS_DEVICE static auto partition_accumulator_shape() { auto acc_shape = partition_shape_C(TiledMma{}, take<0,2>(TileShape{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N) @@ -794,11 +792,10 @@ public: Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{}); // (MMA,MMA_M,MMA_K,PIPE) Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{}); // (MMA,MMA_N,MMA_K,PIPE) - auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, cute::cluster_shape()); - Layout cta_layout_mnk = make_layout(cluster_shape); + // Define 
the CTA-in-cluster Layout and Coord + Layout cta_layout_mnk = make_layout(cluster_shape_); Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{})); - int block_rank_in_cluster = cute::block_rank_in_cluster(); - auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster); + auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_); // Project the cta_layout for tma_a along the n-modes auto [tAgA_mk, tAsA] = tma_partition(*observed_tma_load_a_, @@ -890,7 +887,7 @@ public: } CUTLASS_DEVICE auto - mma_init(TensorStorage& shared_tensors) { + mma_init(TensorStorage& shared_tensors) const { Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) @@ -909,6 +906,9 @@ private: typename Params::TMA_A const* observed_tma_load_a_ = nullptr; typename Params::TMA_B const* observed_tma_load_b_ = nullptr; + + ClusterShape cluster_shape_; + uint32_t block_rank_in_cluster_; }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/dispatch_policy.hpp b/include/cutlass/conv/dispatch_policy.hpp index b4bf8a53..d569cb1c 100644 --- a/include/cutlass/conv/dispatch_policy.hpp +++ b/include/cutlass/conv/dispatch_policy.hpp @@ -86,7 +86,10 @@ struct MainloopSm90TmaGmmaWarpSpecializedImplicitGemm { // SM100 tensor op kernel schedule -struct KernelImplicitTmaWarpSpecializedSm100 { }; +struct KernelImplicitTmaWarpSpecializedSm100 { + static constexpr int SchedulerPipelineStageCount = 0; + static constexpr int AccumulatorPipelineStageCount = 0; +}; // Pseudo-policies for builder auto override that dispatches to the KernelImplicitTmaWarpSpecializedSm100 // but for opting into 1 or 2 SM atoms @@ -96,11 +99,23 @@ struct KernelImplicitTmaWarpSpecialized2SmSm100 : KernelImplicitTmaWarpSpecializ 
struct KernelStridedDgradTmaWs1SmSm100 { }; struct KernelStridedDgradTmaWs2SmSm100 { }; +// Policy for implicit gemm kernel +template< + int SchedulerPipelineStageCount_, + int AccumulatorPipelineStageCount_ +> +struct KernelScheduleImplicitTmaWarpSpecializedSm100 : KernelImplicitTmaWarpSpecializedSm100 { + static constexpr int SchedulerPipelineStageCount = SchedulerPipelineStageCount_; + static constexpr int AccumulatorPipelineStageCount = AccumulatorPipelineStageCount_; +}; + // n-buffer in smem (Blackwell TMA), pipelined with Blackwell UMMA and TMA, fprop template< conv::Operator ConvOp_, int Stages_, int NumSpatialDimensions_, + int SchedulerPipelineStageCount_, + int AccumulatorPipelineStageCount_, class ClusterShape_ = cute::Shape,cute::C<1>,cute::C<1>> > struct MainloopSm100TmaUmmaWarpSpecializedImplicitGemm { @@ -109,7 +124,7 @@ struct MainloopSm100TmaUmmaWarpSpecializedImplicitGemm { static constexpr Operator ConvOp = ConvOp_; using ClusterShape = ClusterShape_; using ArchTag = arch::Sm100; - using Schedule = KernelImplicitTmaWarpSpecializedSm100; + using Schedule = KernelScheduleImplicitTmaWarpSpecializedSm100; static_assert(NumSpatialDimensions >= 1); }; diff --git a/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp b/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp index 90236e1f..0874d8f8 100644 --- a/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp @@ -29,8 +29,6 @@ * **************************************************************************************************/ - - #pragma once #include "cutlass/cutlass.h" @@ -110,7 +108,8 @@ public: static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled; // TileID scheduler // CLC pipeline depth determines how many waves (stages-1) the scheduler can race ahead - static constexpr uint32_t SchedulerPipelineStageCount = 2; + static constexpr uint32_t 
SchedulerPipelineStageCount = DispatchPolicy::Schedule::SchedulerPipelineStageCount; + static constexpr uint32_t AccumulatorPipelineStageCount = DispatchPolicy::Schedule::AccumulatorPipelineStageCount; using TileSchedulerTag = TileSchedulerTag_; using TileScheduler = typename cutlass::gemm::kernel::detail::TileSchedulerSelector< @@ -135,7 +134,6 @@ public: static constexpr uint32_t NumFixupBarriers = 1; // Pipelines and pipeline states - static constexpr uint32_t AccumulatorPipelineStageCount = SchedulerPipelineStageCount; static constexpr uint32_t CLCResponseSize = sizeof(typename TileScheduler::CLCResponse); // Pipeline and pipeline state types @@ -157,10 +155,6 @@ public: using CLCPipelineState = cutlass::PipelineDetail::PipelineCLCFetchAsyncPipelineState; using CLCPipelineSharedStorage = cutlass::PipelineDetail::PipelineCLCFetchAsyncSharedStorage; - using CLCThrottlePipeline = cutlass::PipelineAsync; - using CLCThrottlePipelineState = cutlass::PipelineDetail::PipelineAsyncPipelineState; - using CLCThrottlePipelineSharedStorage = cutlass::PipelineDetail::PipelineAsyncSharedStorage; - using TmemAllocator = cute::conditional_t(typename TiledMma::ThrLayoutVMNK{})) == 1, cute::TMEM::Allocator1Sm, cute::TMEM::Allocator2Sm>; @@ -172,14 +166,12 @@ public: using LoadOrderBarrierStorage = typename LoadOrderBarrier::SharedStorage; using CLCPipelineStorage = CLCPipelineSharedStorage; using AccumulatorPipelineStorage = typename AccumulatorPipeline::SharedStorage; - using CLCThrottlePipelineStorage = CLCThrottlePipelineSharedStorage; alignas(16) MainloopPipelineStorage mainloop; alignas(16) EpiLoadPipelineStorage epi_load; alignas(16) LoadOrderBarrierStorage load_order; alignas(16) CLCPipelineStorage clc; alignas(16) AccumulatorPipelineStorage accumulator; - alignas(16) CLCThrottlePipelineStorage clc_throttle; alignas(16) arch::ClusterBarrier tmem_dealloc; } pipelines; @@ -193,7 +185,6 @@ public: EpilogueTensorStorage epilogue; MainloopTensorStorage mainloop; } tensors; - }; 
static constexpr int SharedStorageSize = sizeof(SharedStorage); @@ -207,7 +198,7 @@ public: KernelHardwareInfo hw_info{}; TileSchedulerArguments scheduler{}; }; - + // Kernel device entry point API struct Params { using ProblemShapeMNKL = decltype(CollectiveMainloop::get_problem_shape_MNKL(ProblemShape{})); @@ -398,7 +389,7 @@ public: : WarpCategory::Epilogue; uint32_t lane_predicate = cute::elect_one_sync(); - auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, cute::cluster_shape()); + auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}); int cluster_size = size(cluster_shape); uint32_t cta_rank_in_cluster = cute::block_rank_in_cluster(); bool is_first_cta_in_cluster = cta_rank_in_cluster == 0; @@ -407,24 +398,23 @@ public: constexpr bool has_mma_peer_cta = size(AtomThrShapeMNK{}) == 2; [[maybe_unused]] uint32_t mma_peer_cta_rank = has_mma_peer_cta ? cta_rank_in_cluster ^ 1 : cta_rank_in_cluster; - // Issue Tma Descriptor Prefetch from a single thread - if ((warp_category == WarpCategory::Sched) && lane_predicate) { - CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); - } - if ((warp_category == WarpCategory::EpilogueLoad) && lane_predicate) { - CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue); - } - // Kernel level shared memory storage SharedStorage& shared_storage = *reinterpret_cast(smem_buf); // In a warp specialized kernel, collectives expose data movement and compute operations separately - CollectiveMainloop collective_mainloop(params.mainloop); + CollectiveMainloop collective_mainloop(params.mainloop, cluster_shape, cta_rank_in_cluster); CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + // Issue Tma Descriptor Prefetch from a single thread + if ((warp_category == WarpCategory::Sched) && lane_predicate) { + collective_mainloop.prefetch_tma_descriptors(); + } + if ((warp_category == WarpCategory::EpilogueLoad) && lane_predicate) { + 
collective_epilogue.prefetch_tma_descriptors(params.epilogue); + } + // Do we load source tensor C or other aux inputs bool is_epi_load_needed = collective_epilogue.is_producer_load_needed(); - IsParticipant is_participant = { (warp_category == WarpCategory::MMA), // mma (warp_category == WarpCategory::Sched) && is_first_cta_in_cluster, // sched @@ -462,7 +452,7 @@ public: epi_load_pipeline_params.producer_arv_count = NumEpilogueLoadThreads; epi_load_pipeline_params.consumer_arv_count = NumEpilogueThreads; epi_load_pipeline_params.transaction_bytes = CollectiveEpilogue::TmaTransactionBytes; - epi_load_pipeline_params.initializing_warp = 4; + epi_load_pipeline_params.initializing_warp = 1; EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params); // Epilogue Store pipeline @@ -474,7 +464,7 @@ public: typename LoadOrderBarrier::Params load_order_barrier_params; load_order_barrier_params.group_id = (warp_category == WarpCategory::MainloopLoad) ? 0 : 1; load_order_barrier_params.group_size = NumMainloopLoadThreads; - load_order_barrier_params.initializing_warp = 5; + load_order_barrier_params.initializing_warp = 3; LoadOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, load_order_barrier_params); // CLC pipeline @@ -493,7 +483,7 @@ public: clc_pipeline_params.consumer_arv_count += cluster_size * NumEpilogueLoadThreads; } clc_pipeline_params.transaction_bytes = CLCResponseSize; - clc_pipeline_params.initializing_warp = 1; + clc_pipeline_params.initializing_warp = 4; CLCPipeline clc_pipeline(shared_storage.pipelines.clc, clc_pipeline_params, cluster_shape); // Mainloop-Epilogue pipeline @@ -507,29 +497,13 @@ public: // Only one producer thread arrives on this barrier. 
accumulator_pipeline_params.producer_arv_count = 1; accumulator_pipeline_params.consumer_arv_count = size(AtomThrShapeMNK{}) * NumEpilogueThreads; - accumulator_pipeline_params.initializing_warp = 2; + accumulator_pipeline_params.initializing_warp = 5; AccumulatorPipeline accumulator_pipeline(shared_storage.pipelines.accumulator, accumulator_pipeline_params, cluster_shape, cute::true_type{}, // Perform barrier init cute::false_type{}); // Delay mask calculation - // CLC throttle pipeline - typename CLCThrottlePipeline::Params clc_throttle_pipeline_params; - if (WarpCategory::MainloopLoad == warp_category) { - clc_throttle_pipeline_params.role = CLCThrottlePipeline::ThreadCategory::Producer; - } - if (WarpCategory::Sched == warp_category) { - clc_throttle_pipeline_params.role = CLCThrottlePipeline::ThreadCategory::Consumer; - } - clc_throttle_pipeline_params.producer_arv_count = NumMainloopLoadThreads; - clc_throttle_pipeline_params.consumer_arv_count = NumSchedThreads; - clc_throttle_pipeline_params.dst_blockid = 0; - clc_throttle_pipeline_params.initializing_warp = 3; - CLCThrottlePipeline clc_throttle_pipeline(shared_storage.pipelines.clc_throttle, clc_throttle_pipeline_params); - CLCThrottlePipelineState clc_pipe_throttle_consumer_state; - CLCThrottlePipelineState clc_pipe_throttle_producer_state = cutlass::make_producer_start_state(); - // Tmem allocator TmemAllocator tmem_allocator{}; @@ -544,12 +518,10 @@ public: // We need this to guarantee that the Pipeline init is visible // To all producers and consumer threadblocks in the cluster - if (cluster_size > 1) { - cute::cluster_arrive_relaxed(); - } - else { - __syncthreads(); - } + pipeline_init_arrive_relaxed(cluster_size); + + auto load_inputs = collective_mainloop.load_init( + problem_shape_MNKL, params.mainloop, shared_storage.tensors.mainloop); uint32_t tmem_stage_ptrs[AccumulatorPipelineStageCount]; MainloopPipelineState mainloop_pipe_consumer_state; @@ -571,7 +543,7 @@ public: // Calculate mask after 
cluster barrier arrival mainloop_pipeline.init_masks(cluster_shape, block_id_in_cluster); - accumulator_pipeline.init_masks(cluster_shape); + accumulator_pipeline.init_masks(cluster_shape, block_id_in_cluster); // TileID scheduler TileScheduler scheduler(&shared_storage.clc_response[0], params.scheduler, problem_shape_MNKL, TileShape{}, block_id_in_cluster); @@ -583,58 +555,13 @@ public: int TmemColumnsPerAccumulatorTile = cutlass::detail::find_tmem_tensor_col_offset(accumulators); pipeline_init_wait(cluster_size); - if (is_participant.sched) { - - // Whether a new CLC query must be performed. - // See comment below where this variable is updated for a description of - // why this variable is needed. - bool requires_clc_query = true; - - do { - if (requires_clc_query) { - // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. - clc_throttle_pipeline.consumer_wait(clc_pipe_throttle_consumer_state); - clc_throttle_pipeline.consumer_release(clc_pipe_throttle_consumer_state); - ++clc_pipe_throttle_consumer_state; - - // Query next clcID and update producer state - clc_pipe_producer_state = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); - } - - // Fetch next work tile - auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( - work_tile_info, - clc_pipeline, - clc_pipe_consumer_state - ); - - // Only perform a new CLC query if we consumed a new CLC query result in - // `fetch_next_work`. An example of a case in which CLC `fetch_next_work` does - // not consume a new CLC query response is when processing stream-K units. - // The current stream-K scheduler uses single WorkTileInfo to track multiple - // (potentially-partial) tiles to be computed via stream-K. In this case, - // `fetch_next_work` simply performs in-place updates on the existing WorkTileInfo, - // rather than consuming a CLC query response. 
- requires_clc_query = increment_pipe; - if (increment_pipe) { - ++clc_pipe_consumer_state; - } - - work_tile_info = next_work_tile_info; - } while (work_tile_info.is_valid()); - clc_pipeline.producer_tail(clc_pipe_producer_state); - } - else if (is_participant.main_load) { - + if (is_participant.main_load) { // Ensure that the prefetched kernel does not touch // unflushed global memory prior to this instruction cutlass::arch::wait_on_dependent_grids(); bool do_load_order_arrive = is_epi_load_needed; - auto load_inputs = collective_mainloop.load_init( - problem_shape_MNKL, params.mainloop, shared_storage.tensors.mainloop); Tensor gA_mk = get<0>(load_inputs); - bool requires_clc_query = true; do { // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. @@ -642,12 +569,6 @@ public: auto k_tile_count = scheduler.get_work_k_tile_count(work_tile_info, problem_shape_MNKL, TileShape{}); auto k_tile_prologue = min(MainloopPipeline::Stages, k_tile_count); - if (is_first_cta_in_cluster && requires_clc_query) { - clc_throttle_pipeline.producer_acquire(clc_pipe_throttle_producer_state); - clc_throttle_pipeline.producer_commit(clc_pipe_throttle_producer_state); - ++clc_pipe_throttle_producer_state; - } - auto [mainloop_producer_state_next, k_tile_iter_next] = collective_mainloop.load( params.mainloop, mainloop_pipeline, @@ -683,7 +604,6 @@ public: ); work_tile_info = next_work_tile_info; cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); - requires_clc_query = increment_pipe; if (increment_pipe) { ++clc_pipe_consumer_state; } @@ -691,60 +611,43 @@ public: collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); } - else if (is_participant.epi_load) { - // Ensure that the prefetched kernel does not touch - // unflushed global memory prior to this instruction - cutlass::arch::wait_on_dependent_grids(); + else if (is_participant.sched) { + // Whether a new CLC query must be performed. 
+ // See comment below where this variable is updated for a description of + // why this variable is needed. + bool requires_clc_query = true; - bool do_load_order_wait = true; - bool do_tail_load = false; do { - bool compute_epilogue = TileScheduler::compute_epilogue(work_tile_info, params.scheduler); + if (requires_clc_query) { + // Query next clcID and update producer state + clc_pipe_producer_state = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); + } - // Get current work tile and fetch next work tile + // Fetch next work tile auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( work_tile_info, clc_pipeline, clc_pipe_consumer_state ); - work_tile_info = next_work_tile_info; + // Only perform a new CLC query if we consumed a new CLC query result in + // `fetch_next_work`. An example of a case in which CLC `fetch_next_work` does + // not consume a new CLC query response is when processing stream-K units. + // The current stream-K scheduler uses single WorkTileInfo to track multiple + // (potentially-partial) tiles to be computed via stream-K. In this case, + // `fetch_next_work` simply performs in-place updates on the existing WorkTileInfo, + // rather than consuming a CLC query response. 
+ requires_clc_query = increment_pipe; if (increment_pipe) { ++clc_pipe_consumer_state; } - if (compute_epilogue) { - - if (do_load_order_wait) { - load_order_barrier.wait(); - do_load_order_wait = false; - } - - epi_load_pipe_producer_state = collective_epilogue.load( - epi_load_pipeline, - epi_load_pipe_producer_state, - problem_shape_MNKL, - CtaShape_MNK{}, - cta_coord_mnkl, - TileShape{}, - TiledMma{}, - shared_storage.tensors.epilogue - ); - - do_tail_load = true; - } - - // Calculate the cta coordinates of the next work tile - cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); + work_tile_info = next_work_tile_info; } while (work_tile_info.is_valid()); - - if (do_tail_load) { - collective_epilogue.load_tail( - epi_load_pipeline, epi_load_pipe_producer_state, - epi_store_pipeline, epi_store_pipe_producer_state); - } + clc_pipeline.producer_tail(clc_pipe_producer_state); } + else if (is_participant.mma) { // Tmem allocation sequence tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); @@ -757,6 +660,7 @@ public: tmem_stage_ptrs[acc_stage] = tmem_base_ptr + (TmemColumnsPerAccumulatorTile * acc_stage) & cutlass::detail::TmemColMask; } auto mma_inputs = collective_mainloop.mma_init(shared_storage.tensors.mainloop); + do { auto k_tile_count = scheduler.get_work_k_tile_count(work_tile_info, problem_shape_MNKL, TileShape{}); @@ -788,7 +692,6 @@ public: mma_inputs, k_tile_count ); - accumulator_pipeline.producer_commit(accumulator_pipe_producer_state); } ++accumulator_pipe_producer_state; @@ -802,6 +705,7 @@ public: // Release the right to allocate before deallocations so that the next CTA can rasterize tmem_allocator.release_allocation_lock(); + // Leader MMA waits for leader + peer epilogues to release accumulator stage if (is_mma_leader_cta) { accumulator_pipeline.producer_tail(accumulator_pipe_producer_state); @@ -816,8 +720,66 @@ public: // Free entire tmem allocation tmem_allocator.free(tmem_base_ptr, 
TmemAllocator::Sm100TmemCapacityColumns); - } + + else if (is_participant.epi_load) { + // Ensure that the prefetched kernel does not touch + // unflushed global memory prior to this instruction + cutlass::arch::wait_on_dependent_grids(); + + bool do_load_order_wait = true; + bool do_tail_load = false; + + do { + bool compute_epilogue = TileScheduler::compute_epilogue(work_tile_info, params.scheduler); + + // Get current work tile and fetch next work tile + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, + clc_pipeline, + clc_pipe_consumer_state + ); + work_tile_info = next_work_tile_info; + + if (increment_pipe) { + ++clc_pipe_consumer_state; + } + + if (compute_epilogue) { + if (do_load_order_wait) { + load_order_barrier.wait(); + do_load_order_wait = false; + } + + epi_load_pipe_producer_state = collective_epilogue.load( + epi_load_pipeline, + epi_load_pipe_producer_state, + problem_shape_MNKL, + CtaShape_MNK{}, + cta_coord_mnkl, + TileShape{}, + TiledMma{}, + shared_storage.tensors.epilogue + ); + + do_tail_load = true; + } + + // Calculate the cta coordinates of the next work tile + cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); + } while (work_tile_info.is_valid()); + + // Only perform a tail load if one of the work units processed performed + // an epilogue load. An example of a case in which a tail load should not be + // performed is in split-K if a cluster is only assigned non-final splits (for which + // the cluster does not compute the epilogue). 
+ if (do_tail_load) { + collective_epilogue.load_tail( + epi_load_pipeline, epi_load_pipe_producer_state, + epi_store_pipeline, epi_store_pipe_producer_state); + } + } + else if (is_participant.epilogue) { // Wait for tmem allocate here tmem_allocation_result_barrier.arrive_and_wait(); @@ -875,13 +837,16 @@ public: epi_load_pipe_consumer_state = load_state_next; epi_store_pipe_producer_state = store_state_next; accumulator_pipe_consumer_state = acc_state_next; - do_tail_store = true; } work_tile_info = next_work_tile_info; cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); } while (work_tile_info.is_valid()); + // Only perform a tail store if one of the work units processed performed + // an epilogue. An example of a case in which a tail load should not be + // performed is in split-K if a cluster is only assigned non-final splits (for which + // the cluster does not compute the epilogue). if (do_tail_store) { collective_epilogue.store_tail( epi_load_pipeline, epi_load_pipe_consumer_state, @@ -889,19 +854,8 @@ public: CtaShape_MNK{}); } } - } -private: - - // Synchronization call. Blocks until barriers are initialized in shared memory. 
- CUTLASS_DEVICE - void - pipeline_init_wait(int cluster_size) { - if (cluster_size > 1) { - cute::cluster_wait(); - } else { - __syncthreads(); } } }; diff --git a/include/cutlass/detail/sm100_blockwise_scale_layout.hpp b/include/cutlass/detail/blockwise_scale_layout.hpp similarity index 67% rename from include/cutlass/detail/sm100_blockwise_scale_layout.hpp rename to include/cutlass/detail/blockwise_scale_layout.hpp index 8f75bd25..2d545bbd 100644 --- a/include/cutlass/detail/sm100_blockwise_scale_layout.hpp +++ b/include/cutlass/detail/blockwise_scale_layout.hpp @@ -179,11 +179,110 @@ struct Sm100BlockwiseScaleConfig { }; +template +struct RuntimeBlockwiseScaleConfig { + + using ShapeSFA = Shape, Shape, int32_t>; + using ShapeSFB = Shape, Shape, int32_t>; + + using StrideSFA = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using StrideSFB = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using LayoutSFA = Layout; + using LayoutSFB = Layout; + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFA() { + return LayoutSFA{}; + } + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFB() { + return LayoutSFB{}; + } + + // The following function is provided for user fill dynamic problem size to the layout_SFA. 
+ template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFA(ProblemShape problem_shape, SFVecShape sf_vec_shape) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(M, sfm))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, sfk)), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + auto mk_layout = make_layout( + make_shape(make_shape(sfm, cute::ceil_div(M, sfm)), + make_shape(sfk, cute::ceil_div(K, sfk))), + strides + ); + + return make_layout(append(shape(mk_layout), L), append(stride(mk_layout), size(filter_zeros(mk_layout)))); + } + + // The following function is provided for user fill dynamic problem size to the layout_SFB. 
+ template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFB(ProblemShape problem_shape, SFVecShape sf_vec_shape) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + + if constexpr (majorSFB == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(N, sfn))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, sfk)), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + auto nk_layout = make_layout( + make_shape(make_shape(sfn, cute::ceil_div(N, sfn)), + make_shape(sfk, cute::ceil_div(K, sfk))), + strides + ); + + return make_layout(append(shape(nk_layout), L), append(stride(nk_layout), size(filter_zeros(nk_layout)))); + } + +}; + +// Sm90 only supports MN major for SFA and SFB for now +template +using Sm90BlockwiseScaleConfig = Sm100BlockwiseScaleConfig; + template constexpr auto sm100_trivial_blockwise_scale_config(MmaTileShape_MNK) { return Sm100BlockwiseScaleConfig(MmaTileShape_MNK{}), size<1>(MmaTileShape_MNK{}), size<2>(MmaTileShape_MNK{})>{}; } +template +constexpr auto sm90_trivial_blockwise_scale_config(MmaTileShape_MNK) { + return Sm90BlockwiseScaleConfig(MmaTileShape_MNK{}), size<1>(MmaTileShape_MNK{}), size<2>(MmaTileShape_MNK{})>{}; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::detail diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp index 44a83b9d..cf9b803b 100644 --- a/include/cutlass/detail/helper_macros.hpp +++ b/include/cutlass/detail/helper_macros.hpp @@ -208,6 +208,35 @@ namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// +// __CUDA_ARCH_SPECIFIC__ is introduced in 
CUDA 12.9 +#if !defined(CUDA_ARCH_CONDITIONAL) + +#if defined(__CUDA_ARCH_SPECIFIC__) +#define CUDA_ARCH_CONDITIONAL(ARCH_XXYY) (__CUDA_ARCH_SPECIFIC__ == ARCH_XXYY) +#else +#define CUDA_ARCH_CONDITIONAL(ARCH_XXYY) (false) +#endif + +#endif + +// __CUDA_ARCH_FAMILY_SPECIFIC__ is introduced in CUDA 12.9 +#if !defined(CUDA_ARCH_FAMILY) + +#if defined(__CUDA_ARCH_FAMILY_SPECIFIC__) +#define CUDA_ARCH_FAMILY(ARCH_XXYY) (__CUDA_ARCH_FAMILY_SPECIFIC__ == ARCH_XXYY) +#else +#define CUDA_ARCH_FAMILY(ARCH_XXYY) (false) +#endif + +#endif + +#if !defined(CUDA_ARCH_CONDITIONAL_OR_FAMILY) +#define CUDA_ARCH_CONDITIONAL_OR_FAMILY(ARCH_XXYY) \ + (CUDA_ARCH_CONDITIONAL(ARCH_XXYY) || CUDA_ARCH_FAMILY(ARCH_XXYY)) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + }; // namespace cutlass //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/detail/layout.hpp b/include/cutlass/detail/layout.hpp index a0a183b0..562adc65 100644 --- a/include/cutlass/detail/layout.hpp +++ b/include/cutlass/detail/layout.hpp @@ -33,10 +33,10 @@ #include "cute/layout.hpp" #include "cute/pointer_sparse.hpp" // cute::is_sparse #include "cute/swizzle.hpp" // cute::Swizzle -#include "cute/swizzle_layout.hpp" // cute::detail::get_swizzle_portion +#include "cute/swizzle_layout.hpp" // cute::get_swizzle_portion #include "cute/util/type_traits.hpp" #include "cute/arch/copy_sm90_tma.hpp" -#include "cute/arch/copy_sm100_tma.hpp" +#include "cute/arch/copy_sm100_tma.hpp" #include "cutlass/layout/matrix.h" #include "cutlass/layout/tensor.h" @@ -219,8 +219,8 @@ stride_to_layout_tag_A() { return layout::ColumnMajor{}; } // Specialize for sparse layout - else if constexpr (cute::get<0>(InternalStrideA{}) == cute::_2{} && - cute::rank(cute::get<1>(InternalStrideA{})) == 2 && + else if constexpr (cute::get<0>(InternalStrideA{}) == cute::_2{} && + 
cute::rank(cute::get<1>(InternalStrideA{})) == 2 && cute::is_same_v(InternalStrideA{}))>>) { return layout::ColumnMajor{}; } @@ -308,8 +308,8 @@ constexpr bool is_tma_copy_engine() { || cute::is_base_of_v || cute::is_base_of_v || cute::is_base_of_v - || cute::is_base_of_v - || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v ) { return true; } @@ -349,7 +349,7 @@ get_alignment_count_from_gmem_tiled_copy() { cutlass::gemm::collective::detail::is_sm10x_f8f6f4_element() && cute::is_same_v::type, uint8_t>) { return 128; } - + // For sparse MMA, alignment in logical elements is increased by sparsity factor if constexpr (cute::is_sparse_v) { return 128 / sizeof_bits::value * ElementMma::sparsity; @@ -366,7 +366,7 @@ get_alignment_count_from_gmem_tiled_copy() { // Return alignment bit requirements for the GEMM inputs. template < class ElementType - , bool IsF8F6F4SubBytes=false + , bool IsF8F6F4SubBytes=false > constexpr int get_input_alignment_bits() { @@ -383,12 +383,12 @@ get_input_alignment_bits() { template constexpr int get_output_alignment_bits() { - + if constexpr (sizeof_bits::value == 6) { // U6 format : The inner tensor size dimension must be a multiple of 96B. 
return 96 * 8; } - + return 128; } @@ -424,7 +424,7 @@ template CUTLASS_HOST_DEVICE constexpr size_t alignment_for_swizzle(Layout layout) { - return alignment_for_swizzle(cute::detail::get_swizzle_portion(layout)); + return alignment_for_swizzle(cute::get_swizzle_portion(layout)); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/builders/sm120_builder.inl b/include/cutlass/epilogue/collective/builders/sm120_builder.inl index ad1f44a0..e1c1bff8 100644 --- a/include/cutlass/epilogue/collective/builders/sm120_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm120_builder.inl @@ -63,13 +63,27 @@ struct EpilogueSFVecSize> static constexpr int value = FusionOp::SFVecSize; }; +// Helper to deduce NumEpilogueWarpGroups based on Schedule +template +struct GetNumEpilogueWarpGroups { + static constexpr int value = 2; +}; + +template +struct GetNumEpilogueWarpGroups> { + static constexpr int value = Schedule::NumEpilogueWarpGroups; +}; + // Returns the parameterized dispatch policy for the TMA epilogue -template +template constexpr auto sm120_get_tma_dispatch_policy() { using namespace cute; constexpr int EpiTiles = size(shape_div(take<0,2>(TileShapeMNK{}), EpilogueTileMN{})); + using StrideD = cutlass::detail::TagToStrideC_t; + using InternalStrideD = cute::remove_pointer_t; + constexpr bool IsGroupedGemmKernel = !cute::is_same_v; // For 120, a FragmentSize of 4 is used to match the // output per thread from each MMA. Epilogue subtiles iterate over multiple of these @@ -86,9 +100,17 @@ sm120_get_tma_dispatch_policy() { // SM120 epilogues use smaller stage counts in order to fit within the limited shared memory capacity. constexpr int StagesC = ReuseSmem ? 
cute::max(cute::min(EpiTiles, 2), StagesD+1) - : StagesD; - - return Sm120TmaWarpSpecialized{}; + : StagesD; + + constexpr int NumEpilogueWarpGroups = GetNumEpilogueWarpGroups::value; + + if constexpr (IsGroupedGemmKernel) { + return Sm120PtrArrayTmaWarpSpecialized{}; + } + else { + return Sm120TmaWarpSpecialized{}; + } } // Returns the smem layout atom to be used for C or D matrix @@ -291,6 +313,9 @@ struct Sm120TmaBuilderImpl { using GmemStrideTypeC = cutlass::detail::TagToStrideC_t; using GmemStrideTypeD = cutlass::detail::TagToStrideC_t; + using UnderlyingGmemStrideTypeC = cute::remove_pointer_t; + using UnderlyingGmemStrideTypeD = cute::remove_pointer_t; + using CopyOpS2G = cute::conditional_t, SM90_TMA_STORE_IM2COL, @@ -306,15 +331,15 @@ struct Sm120TmaBuilderImpl { // Get the smallest tiled copy we can use to retile the accumulators using CopyAtomC = Copy_Atom; - using SmemLayoutAtomC = decltype(detail::sm120_get_epilogue_smem_swizzle_layout_atom()); - using SmemLayoutAtomD = decltype(detail::sm120_get_epilogue_smem_swizzle_layout_atom()); + using SmemLayoutAtomC = decltype(detail::sm120_get_epilogue_smem_swizzle_layout_atom()); + using SmemLayoutAtomD = decltype(detail::sm120_get_epilogue_smem_swizzle_layout_atom()); - using CopyOpS2R = decltype(detail::sm120_get_smem_load_op_for_source()); + using CopyOpS2R = decltype(detail::sm120_get_smem_load_op_for_source()); - using CopyOpR2S = decltype(detail::sm120_get_smem_store_op_for_accumulator()); + using CopyOpR2S = decltype(detail::sm120_get_smem_store_op_for_accumulator()); // Get register to register tiled copy that happen before shared memory store. - using CopyOpR2R = decltype(detail::sm120_get_register_transform_op()); + using CopyOpR2R = decltype(detail::sm120_get_register_transform_op()); // TMA builder allows for passing callbacks directly, which is either a fusion::FusionCallbacks // instance or a direct visitor implementation, e.g. 
fusion::Sm90LinearCombination @@ -334,8 +359,32 @@ struct Sm120TmaBuilderImpl { constexpr static bool ReuseSmemC = DispatchPolicy::ReuseSmemC; constexpr static bool DelayTmaStore = DispatchPolicy::DelayTmaStore; + //Helper to deduce BaseDispatchPolicy based on DispatchPolicy + template + struct GetBaseDispatchPolicy { + using Type = T; + }; + + template + struct GetBaseDispatchPolicy> { + using Type = typename cutlass::epilogue::Sm90PtrArrayTmaWarpSpecialized; + }; + + template + struct GetBaseDispatchPolicy> { + using Type = typename cutlass::epilogue::Sm90TmaWarpSpecialized; + }; + + using BaseDispatchPolicy = typename GetBaseDispatchPolicy::Type; + using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue< - Sm90TmaWarpSpecialized, + BaseDispatchPolicy, TileShape_MNK, EpilogueTile_MN, ElementC_, // Need to pass void through to expose via GemmUniversal @@ -394,13 +443,15 @@ struct CollectiveBuilder< cute::enable_if_t || cute::is_same_v || cute::is_same_v || + cute::is_same_v || + cute::is_same_v || cute::is_same_v >> { private: using EpilogueTile_MN = decltype(detail::sm120_compute_tile_shape_or_override, FusionOperation>()); using DispatchPolicy = - decltype(detail::sm120_get_tma_dispatch_policy, Schedule>()); + decltype(detail::sm120_get_tma_dispatch_policy()); public: diff --git a/include/cutlass/epilogue/collective/default_epilogue.hpp b/include/cutlass/epilogue/collective/default_epilogue.hpp index b7bd6f40..0d019b1c 100644 --- a/include/cutlass/epilogue/collective/default_epilogue.hpp +++ b/include/cutlass/epilogue/collective/default_epilogue.hpp @@ -35,6 +35,7 @@ #pragma once #include "cutlass/cutlass.h" +#include "cutlass/arch/memory.h" #include "cutlass/gemm/dispatch_policy.hpp" #include "cutlass/epilogue/collective/detail.hpp" @@ -225,22 +226,27 @@ public: return; } + using FragCType = remove_cvref_t; + using FragDType = remove_cvref_t; + // source is needed if (epilogue_op.is_source_needed()) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 
size(accumulators); ++i) { - if (elem_less(tCcD(i), residue_tCcD)) { - tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); - } + FragCType fragC; + bool pred = elem_less(tCcD(i), residue_tCcD); + arch::global_load(fragC, &tCgC(i), pred); + FragDType fragD = epilogue_op(accumulators(i), fragC); + arch::global_store(fragD, &tCgD(i), pred); } } // source is not needed, avoid load else { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(accumulators); ++i) { - if (elem_less(tCcD(i), residue_tCcD)) { - tCgD(i) = epilogue_op(accumulators(i)); - } + bool pred = elem_less(tCcD(i), residue_tCcD); + FragDType fragD = epilogue_op(accumulators(i)); + arch::global_store(fragD, &tCgD(i), pred); } } } diff --git a/include/cutlass/epilogue/collective/detail.hpp b/include/cutlass/epilogue/collective/detail.hpp index 2759d0c6..2c72c301 100644 --- a/include/cutlass/epilogue/collective/detail.hpp +++ b/include/cutlass/epilogue/collective/detail.hpp @@ -124,6 +124,23 @@ struct sm90_is_ptr_array_tma_dispatch_policy< NumEpilogueWarpGroups>> : cute::true_type {}; +template< + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups +> +struct sm90_is_ptr_array_tma_dispatch_policy< + Sm120PtrArrayTmaWarpSpecialized> + : cute::true_type {}; + template static constexpr bool sm90_is_ptr_array_tma_dispatch_policy_v = sm90_is_ptr_array_tma_dispatch_policy::value; diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp index 77a3b510..0ed7d6b9 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp @@ -144,7 +144,6 @@ private: static_assert(StagesD >= 1, "StagesD must be >= 1"); constexpr static bool ReuseSmemC = ReuseSmemC_ && is_destination_supported; - constexpr static bool DelayTmaStore = 
DelayTmaStore_; constexpr static bool is_m_major_C = detail::is_m_major(); constexpr static bool is_m_major_D = detail::is_m_major(); @@ -172,6 +171,12 @@ private: constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + // Do not unroll the epi subtile loop when the activation op is heavy, to reduce instruction size and register pressure. + constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + // TMA store delay only benefits with loop unrolling + constexpr static bool DelayTmaStore = DelayTmaStore_ and UnrollEpiLoop; + struct CollectiveStorageWithC { alignas(SmemAlignmentC) ArrayEngine> smem_C; alignas(SmemAlignmentD) ArrayEngine> smem_D; @@ -860,10 +865,12 @@ public: synchronize(); } // For each epilogue subtile within the CTA tile - CUTLASS_PRAGMA_UNROLL - for (int iter_n = 0; iter_n < size<3>(gD_epi); ++iter_n) { - CUTLASS_PRAGMA_UNROLL - for (int iter_m = 0; iter_m < size<2>(gD_epi); ++iter_m) { + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? 
NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { int epi_m = iter_m, epi_n = iter_n; bool is_first_iteration = iter_m == 0 && iter_n == 0; bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; @@ -1215,10 +1222,12 @@ public: } // For each epilogue subtile within the CTA tile - CUTLASS_PRAGMA_UNROLL - for (int iter_n = 0; iter_n < size<3>(gD_epi); ++iter_n) { - CUTLASS_PRAGMA_UNROLL - for (int iter_m = 0; iter_m < size<2>(gD_epi); ++iter_m) { + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { int epi_m = iter_m, epi_n = iter_n; bool is_first_iteration = iter_m == 0 && iter_n == 0; bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; @@ -1478,16 +1487,23 @@ public: tensormaps_cp_fence_release( TensorMapStorage& shared_tensormap, cute::TmaDescriptor const* tensormap) { + // Commit and wait for all TMA load/store instructions before updating the tensormap in gmem. + // This operation only happens when the group/batch changes between consecutive tiles. + // If there are no uncommitted instructions then tma_desc_commit_group results in an empty bulk async-group. 
+ auto tma_desc_wait_all_fn = [] () CUTLASS_LAMBDA_FUNC_INLINE { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } + }; // Entire warp must do this (ie its aligned) if constexpr (IsLoad) { if (is_source_supported) { - if (cute::elect_one_sync()) { - cute::tma_desc_commit_group(); - cute::tma_desc_wait_group(); - } + tma_desc_wait_all_fn(); tma_descriptor_cp_fence_release(tensormap, shared_tensormap.smem_tensormap_C); } } else if constexpr (is_destination_supported) { + tma_desc_wait_all_fn(); tma_descriptor_cp_fence_release(tensormap, shared_tensormap.smem_tensormap_D); } } diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp index 2eb5c582..c2172798 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp @@ -462,6 +462,10 @@ private: || is_same_v; // alloc reduction buffer for custom EVTs constexpr static size_t ImplicitSharedStorageSize = IsReductionBufferNeeded ? size(EpilogueTile{}) : 0; + // Do not unroll the epi subtile loop when the activation op is heavy, to reduce instruction size and register pressure. + constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + public: constexpr static int ThreadCount = 128; constexpr static uint32_t TmaTransactionBytes = 0; @@ -669,10 +673,12 @@ public: static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem reuse requires 1 accumulator stage"); // For each epilogue subtile within the CTA tile - CUTLASS_PRAGMA_UNROLL - for (int iter_n = 0; iter_n < size<4>(tTR_tAcc); ++iter_n) { - CUTLASS_PRAGMA_UNROLL - for (int iter_m = 0; iter_m < size<3>(tTR_tAcc); ++iter_m) { + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<4>(tTR_tAcc)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<3>(tTR_tAcc)); + #pragma unroll(UnrollEpiLoop ? 
NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { int epi_m = iter_m, epi_n = iter_n; bool is_last_iteration = iter_m == size<3>(tTR_tAcc)-1 && iter_n == size<4>(tTR_tAcc)-1; diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp index 89e5448c..3f445bf5 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp @@ -140,7 +140,6 @@ private: static_assert(StagesD >= 1, "StagesD must be >= 1"); constexpr static bool ReuseSmemC = ReuseSmemC_; - constexpr static bool DelayTmaStore = DelayTmaStore_; constexpr static bool is_source_supported = not cute::is_void_v; constexpr static bool is_m_major_C = detail::is_m_major(); @@ -172,6 +171,12 @@ private: constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + // Do not unroll the epi subtile loop when the activation op is heavy, to reduce instruction size and register pressure. 
+ constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + // TMA store delay only benefits with loop unrolling + constexpr static bool DelayTmaStore = DelayTmaStore_ and UnrollEpiLoop; + struct CollectiveStorageWithC { alignas(SmemAlignmentC) ArrayEngine> smem_C; alignas(SmemAlignmentD) ArrayEngine> smem_D; @@ -808,10 +813,12 @@ public: ConsumerToken acc_wait_token = acc_pipeline.consumer_try_wait(acc_pipe_consumer_state); // For each epilogue subtile within the CTA tile - CUTLASS_PRAGMA_UNROLL - for (int iter_n = 0; iter_n < size<3>(gD_epi); ++iter_n) { - CUTLASS_PRAGMA_UNROLL - for (int iter_m = 0; iter_m < size<2>(gD_epi); ++iter_m) { + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { int epi_m = iter_m, epi_n = iter_n; bool is_first_iteration = iter_m == 0 && iter_n == 0; bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; @@ -1162,10 +1169,12 @@ public: } // For each epilogue subtile within the CTA tile - CUTLASS_PRAGMA_UNROLL - for (int iter_n = 0; iter_n < size<3>(gD_epi); ++iter_n) { - CUTLASS_PRAGMA_UNROLL - for (int iter_m = 0; iter_m < size<2>(gD_epi); ++iter_m) { + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? 
NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { int epi_m = iter_m, epi_n = iter_n; bool is_first_iteration = iter_m == 0 && iter_n == 0; bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp index b27ec712..41c95f16 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -41,6 +41,7 @@ #include "cutlass/epilogue/thread/scale_type.h" #include "cutlass/epilogue/fusion/callbacks.hpp" #include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp" #include "cutlass/detail/collective.hpp" #include "cutlass/detail/layout.hpp" #include "cutlass/trace.h" @@ -304,8 +305,9 @@ public: // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. 
auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1)); - auto init_M = get<0>(init_shape); - auto init_N = get<1>(init_shape); + constexpr int tma_alignment_bits = 128; + auto init_M = tma_alignment_bits; + auto init_N = tma_alignment_bits; auto init_L = get<3>(init_shape); static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D"); @@ -761,7 +763,14 @@ public: CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M"); + if constexpr (epi_tile_m * epi_tile_n > mma_tile_m * mma_tile_n) { + // When the epilogue subtile is larger than the MMA tiles, loop over multiple MMA tiles + CUTE_STATIC_ASSERT(epi_tile_n % mma_tile_n == 0, "MMA_TILE_N must divide EPI_TILE_N"); + } + else { CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N"); + } + // Get TiledCopy for partition reference when consumer store. TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); // Get the fusion callbacks for the consumer store warps @@ -784,6 +793,12 @@ public: bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + using FragmentVisit = decltype(cst_callbacks.visit(tRS_rAcc_frg(0), 0, 0, 0)); + constexpr bool IsDirectR2S = cute::is_same_v>; + using RegisterElementD = cute::conditional_t; + Tensor tRS_rCompute = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tRS_rCompute_frg = recast>(tRS_rCompute); + // Thread synchronizer for previously issued waits or fences // to ensure visibility of smem reads/writes to threads or TMA unit auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; @@ -894,17 +909,41 @@ public: ++load_wait_state; } - int mma_m = epi_m; - int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n; - Tensor 
tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + if constexpr (epi_tile_m * epi_tile_n > mma_tile_m * mma_tile_n) { + // When the epilogue subtile is larger than the MMA tiles, loop over multiple + // MMA tiles + static constexpr int MmaMPerEpiM = epi_tile_m / mma_tile_m; + static constexpr int MmaNPerEpiN = epi_tile_n / mma_tile_n; - // Vectorized fragment loop with visitor callback entry point - int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); - int r2s_v = epi_n_in_mma * size(tRS_rD_frg); - CUTLASS_PRAGMA_UNROLL - for (int epi_v = 0; epi_v < size(tRS_rD_frg); ++epi_v) { - tRS_rD_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_in_epi = 0; mma_n_in_epi < MmaNPerEpiN; ++mma_n_in_epi) { + int mma_n = (epi_n * MmaNPerEpiN) + mma_n_in_epi; + + CUTLASS_PRAGMA_UNROLL + for (int mma_m_in_epi = 0; mma_m_in_epi < MmaMPerEpiM; ++mma_m_in_epi) { + int mma_m = (epi_m * MmaMPerEpiM) + mma_m_in_epi; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + int idx_in_epi_subtile = (mma_n_in_epi * MmaMPerEpiM + mma_m_in_epi); + + tRS_rCompute_frg(idx_in_epi_subtile) = cst_callbacks.visit( + tRS_rAcc_frg_mn(0), idx_in_epi_subtile, epi_m, epi_n); + } + } } + else { + int mma_m = epi_m; + int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + + // Vectorized fragment loop with visitor callback entry point + int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); + int r2s_v = epi_n_in_mma * size(tRS_rCompute_frg); + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tRS_rCompute_frg); ++epi_v) { + tRS_rCompute_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n); + } + } + // The latest we can delay the TMA store is right before the smem store of the next iteration // since the current TMA store needs to be committed before we can acquire the next smem buffer if constexpr (DelayTmaStore) { @@ -918,7 
+957,7 @@ public: // Smem reduction callback entry point using current store buffer for workspace cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()), - synchronize, epi_m, epi_n, is_last_iteration, tRS_rD_frg); + synchronize, epi_m, epi_n, is_last_iteration, tRS_rCompute_frg); // Copy tile from register to regiser if needed if constexpr (IsUseR2R) { @@ -930,6 +969,11 @@ public: copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rD_frg); ++i) { + tRS_rD_frg(i) = cutlass::NumericArrayConverter{}(tRS_rCompute_frg(i)); + } + // Copy tile from register to smem if constexpr (is_destination_supported) { copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); @@ -1140,7 +1184,6 @@ public: ProblemShape_MNKL problem_shape_mnkl, int32_t next_batch, int32_t warp_group_idx) { - if (cute::elect_one_sync()) { // Replacing global_address for the next batch tensormaps_replace_global_address(shared_tensormaps, params, next_batch, warp_group_idx); @@ -1161,14 +1204,24 @@ public: TensorMapStorage& shared_tensormaps, cute::TmaDescriptor const* tensormap, const int32_t warp_group_idx = 0) { - + // Commit and wait for all TMA load/store instructions before updating the tensormap in gmem. + // This operation only happens when the group/batch changes between consecutive tiles. + // If there are no uncommitted instructions then tma_desc_commit_group results in an empty bulk async-group. 
+ auto tma_desc_wait_all_fn = [] () CUTLASS_LAMBDA_FUNC_INLINE { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } + }; // Entire warp must do this (ie its aligned) if constexpr (IsLoad) { if constexpr (is_source_supported) { + tma_desc_wait_all_fn(); tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_C); } } else if constexpr (is_destination_supported) { + tma_desc_wait_all_fn(); tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_D[warp_group_idx]); } } diff --git a/include/cutlass/epilogue/dispatch_policy.hpp b/include/cutlass/epilogue/dispatch_policy.hpp index 870be4c2..2e6213fe 100644 --- a/include/cutlass/epilogue/dispatch_policy.hpp +++ b/include/cutlass/epilogue/dispatch_policy.hpp @@ -255,6 +255,23 @@ struct Sm120TmaWarpSpecialized { constexpr static bool DelayTmaStore = DelayTmaStore_; }; +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + int NumEpilogueWarpGroups_ +> +struct Sm120PtrArrayTmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; + constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; +}; + ////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::epilogue diff --git a/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp index 8f391aac..b769b1f0 100644 --- a/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp @@ -1317,6 +1317,277 @@ struct FusionCallbacks< using Impl::Impl; }; +// Sm120 Ptr array tma warp specialized callbacks just 
alias to their sm90 counterpart +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... Args +> +struct FusionCallbacks< + epilogue::Sm120PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... +> : FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... + > { + using FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>::FusionCallbacks; +}; + +// For Ptr-Array and Grouped GEMM +// D = alpha * acc + beta * C, where alpha and beta can be vectors for each batch/group +// With Row BlockScaleFactor Generation, separate tensors per batch/group. +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinearCombRowBlockScaleFactorPtrArray = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor *, RoundStyle + >, // gen scalefactor + Sm90LinearCombinationPtrArray< ElementCompute, ElementCompute, + ElementSource, ElementScalar, RoundStyle + > // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120PtrArrayTmaWarpSpecialized, + 
fusion::LinCombBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinearCombRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + > { + + using Impl = + Sm120LinearCombRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + >; + + using Operation = + fusion::LinCombBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + ElementBlockScaleFactor ** block_scale_factor_ptr = nullptr; + + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + + +// For Ptr-Array and Grouped GEMM +// D = activation(alpha * acc + beta * C), where alpha and beta can be vectors for each batch/group +// With Row BlockScaleFactor Generation, separate tensors per batch/group. 
+template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombEltActRowBlockScaleFactorPtrArray = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor *, RoundStyle + >, // gen scalefactor + Sm90LinCombEltActPtrArray // activation(beta * C + (alpha * acc)) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120PtrArrayTmaWarpSpecialized, + fusion::LinCombEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombEltActRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + > { + + using Impl = + Sm120LinCombEltActRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + >; + + using Operation = + fusion::LinCombEltActBlockScaleFactor< + 
ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + ElementBlockScaleFactor ** block_scale_factor_ptr = nullptr; + + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; } // namespace cutlass::epilogue::fusion ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp index 59a9d030..e72e971b 100644 --- a/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp @@ -94,6 +94,8 @@ struct Sm120BlockScaleFactorRowStore { using Params = Arguments; + using UnderlyingElementBlockScaleFactor = cute::remove_pointer_t; + template static constexpr Params to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { @@ -390,21 +392,21 @@ struct Sm120BlockScaleFactorRowStore { } ElementCompute pvscale = mul(amax, norm_constant_scaled_down); - ElementBlockScaleFactor qpvscale = NumericConverter{}(pvscale); + UnderlyingElementBlockScaleFactor qpvscale = NumericConverter{}(pvscale); tC_rSFD_flt(coord) = qpvscale; // // Apply the scale factor to the output // ElementCompute qpvscale_rcp = [&]() { - if constexpr (cute::is_same_v) { + if constexpr (cute::is_same_v) { // UE8M0: Use integer subtraction to do the fast rcp in ue8m0 and then convert to float. - auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate{}(qpvscale); - return cutlass::NumericConverter{}(e8m0_qpvscale_rcp); + auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate{}(qpvscale); + return cutlass::NumericConverter{}(e8m0_qpvscale_rcp); } else { // UE4M3: Do the rcp in fp32 data type. 
- auto qpvscale_up = cutlass::NumericConverter{}(qpvscale); + auto qpvscale_up = cutlass::NumericConverter{}(qpvscale); return cutlass::reciprocal_approximate_ftz{}(qpvscale_up); } }(); @@ -458,15 +460,24 @@ struct Sm120BlockScaleFactorRowStore { auto [M, N, K, L] = args.problem_shape_mnkl; auto [m, n, k, l] = args.tile_coord_mnkl; using Sm1xxBlockScaledOutputConfig = cutlass::detail::Sm1xxBlockScaledOutputConfig; + UnderlyingElementBlockScaleFactor* ptr_scale_factor = nullptr; + // If Ptr-Array/Grouped GEMM with BlockScaleFactor per batch/group + if constexpr (!cute::is_same_v) { + ptr_scale_factor = params_ptr->ptr_scale_factor[l]; + l = 0; + } + else { + ptr_scale_factor = params_ptr->ptr_scale_factor; + } auto epi_tile_mn = shape<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)); - Tensor mSFD = make_tensor(make_gmem_ptr(params_ptr->ptr_scale_factor), Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); + Tensor mSFD = make_tensor(make_gmem_ptr(ptr_scale_factor), Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); static_assert(size<1>(EpilogueTile{}) && ((size<1>(EpilogueTile{}) & (size<1>(EpilogueTile{}) - 1)) == 0), "Epilogue Tile N should be pow of 2"); Tensor gSFD = local_tile(mSFD, args.epi_tile, make_coord(_, _,l)); // (EPI_M,EPI_N, #EPI_Ms, #EPI_Ns) Tensor tCgSFD = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) gSFD, args.epi_tile, args.tiled_copy, args.thread_idx); - Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) + Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) auto tile_coord_mn = make_coord(m * size<0>(epi_tile_mn), n * size<1>(epi_tile_mn)); @@ -537,6 +548,8 @@ struct Sm120BlockScaleFactorColStore { }; using Params = Arguments; + using UnderlyingElementBlockScaleFactor = cute::remove_pointer_t; + template static constexpr Params 
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { @@ -770,21 +783,21 @@ struct Sm120BlockScaleFactorColStore { synchronize(); ElementCompute pvscale = mul(amax, norm_constant_scaled_down); - ElementBlockScaleFactor qpvscale = NumericConverter{}(pvscale); + UnderlyingElementBlockScaleFactor qpvscale = NumericConverter{}(pvscale); filter(tC_rSFD)(sf_id + mma_in_epi*ColsPerThreadAccFrag) = qpvscale; // // Apply the scale factor to the output // ElementCompute qpvscale_rcp = [&]() { - if constexpr (cute::is_same_v) { + if constexpr (cute::is_same_v) { // UE8M0: Use integer subtraction to do the fast rcp in ue8m0 and then convert to float. - auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate{}(qpvscale); - return cutlass::NumericConverter{}(e8m0_qpvscale_rcp); + auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate{}(qpvscale); + return cutlass::NumericConverter{}(e8m0_qpvscale_rcp); } else { // UE4M3: Do the rcp in fp32 data type. - auto qpvscale_up = cutlass::NumericConverter{}(qpvscale); + auto qpvscale_up = cutlass::NumericConverter{}(qpvscale); return cutlass::reciprocal_approximate_ftz{}(qpvscale_up); } }(); @@ -829,18 +842,27 @@ struct Sm120BlockScaleFactorColStore { auto [M, N, K, L] = args.problem_shape_mnkl; auto [m, n, k, l] = args.tile_coord_mnkl; using Sm1xxBlockScaledOutputConfig= cutlass::detail::Sm1xxBlockScaledOutputConfig; + UnderlyingElementBlockScaleFactor* ptr_scale_factor = nullptr; + // If Ptr-Array/Grouped GEMM with BlockScaleFactor per batch/group + if constexpr (!cute::is_same_v) { + ptr_scale_factor = params_ptr->ptr_scale_factor[l]; + l = 0; + } + else { + ptr_scale_factor = params_ptr->ptr_scale_factor; + } static_assert(size<0>(EpilogueTile{}) && ((size<0>(EpilogueTile{}) & (size<1>(EpilogueTile{}) - 1)) == 0), "Epilogue Tile N should be pow of 2"); auto epi_tile_mn = shape<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)); - Tensor mSFD = 
make_tensor(make_gmem_ptr(params_ptr->ptr_scale_factor), + Tensor mSFD = make_tensor(make_gmem_ptr(ptr_scale_factor), Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); Tensor gSFD = local_tile(mSFD, args.epi_tile, make_coord(_, _,l)); // (EPI_M,EPI_N, #EPI_Ms, #EPI_Ns) Tensor tCgSFD = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) gSFD, args.epi_tile, args.tiled_copy, args.thread_idx); - Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) + Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) auto tile_coord_mn = make_coord(m * size<0>(epi_tile_mn), n * size<1>(epi_tile_mn)); diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 33c5585f..c3abfdff 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -52,6 +52,18 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// +// If kIsHeavy is a member, use it. Otherwise, assume that it's false. +template +struct kIsHeavy_member_or_false { + static constexpr bool value = false; +}; +template +struct kIsHeavy_member_or_false::type> { + static constexpr bool value = Op::kIsHeavy; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + // Identity operator template struct Identity { @@ -113,6 +125,8 @@ template