v4.2 release. (#2587)

* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set on the command line.

* v4.2 release.
This commit is contained in:
Junkai-Wu
2025-08-23 06:11:24 +08:00
committed by GitHub
parent 11cad1f67b
commit a49a78ffef
351 changed files with 28182 additions and 2032 deletions

View File

@ -659,7 +659,7 @@ struct Testbed {
}
int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
int64_t bytes = cutlass::bits_to_bytes(
int64_t bytes = cutlass::bits_to_bytes<int64_t>(
(cutlass::sizeof_bits<ElementD>::value * 2 + cutlass::sizeof_bits<ElementSoftmax>::value) *
options.problem_size.m() * options.problem_size.n());

View File

@ -33,8 +33,8 @@
computing reference permutations of 4/5D tensors when source data is column-major.
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/coord.h"

View File

@ -40,14 +40,12 @@
Note that in general the fragment passed to the OutputOp could
span multiple rows but it does not happen with the configurations we have
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/aligned_buffer.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/functional.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/layout/vector.h"

View File

@ -42,12 +42,10 @@
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/aligned_buffer.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/functional.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/layout/vector.h"

View File

@ -38,10 +38,8 @@
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/numeric_types.h"
#include "cutlass/array.h"
#include "cutlass/layout/vector.h"

View File

@ -37,12 +37,10 @@
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/array.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/array.h"
#include "cutlass/layout/vector.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/tensor_coord.h"

View File

@ -132,7 +132,7 @@ constexpr int ScaleGranularityK = 128;
constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, cute::GMMA::Major::MN, cute::GMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand

View File

@ -142,7 +142,7 @@ static constexpr int ScaleGranularityK = 128;
static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, cute::GMMA::Major::MN, cute::GMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand

View File

@ -454,11 +454,12 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -640,11 +640,11 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}
}
//
// Parse options

View File

@ -33,7 +33,7 @@ set(TEST_SWIZZLE_2 --swizzle=2)
set(TEST_SWIZZLE_5 --swizzle=5)
set(TEST_SWIZZLE_5_UNEVEN --swizzle=5 --m=4096 --n=16384)
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
70_blackwell_fp16_gemm
70_blackwell_fp16_gemm.cu

View File

@ -449,9 +449,9 @@ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MIN
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}

View File

@ -27,7 +27,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Both filenames are shorter to avoid MAX_PATH issues on Windows.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
71_blackwell_gemm_with_collective_builder
71_blackwell_gemm_with_collective_builder.cu

View File

@ -116,7 +116,7 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O
// Kernel Perf config
using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size
using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster
using ClusterShape = Shape<_2,_4,_1>; // Shape of the threadblocks in a cluster
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
@ -511,10 +511,10 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}
}
//
// Parse options

View File

@ -566,8 +566,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -117,7 +117,7 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O
// Kernel Perf config
using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size
using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster
using ClusterShape = Shape<_2,_4,_1>; // Shape of the threadblocks in a cluster
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
@ -512,8 +512,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -28,7 +28,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
72a_blackwell_nvfp4_bf16_gemm
72a_blackwell_nvfp4_bf16_gemm.cu

View File

@ -28,7 +28,7 @@
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
73_blackwell_gemm_preferred_cluster
blackwell_gemm_preferred_cluster.cu

View File

@ -513,7 +513,7 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}

View File

@ -29,9 +29,9 @@
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
cutlass_example_add_executable(
74_blackwell_gemm_streamk
blackwell_gemm_streamk.cu
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
74_blackwell_gemm_streamk
blackwell_gemm_streamk.cu
)
endif()

View File

@ -556,10 +556,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -762,9 +762,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (!(props.major == 10 && props.minor == 0)) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 100a).\n";
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -138,8 +138,7 @@ using FusionOperation = cutlass::epilogue::fusion::LinCombEltActBlockScaleFactor
// Core kernel configurations
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using EpilogueOperatorClass = cutlass::arch::OpClassTensorOp; // Epilogue Operator class tag
using MainloopOperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Mainloop Operator class tag
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size
// Runtime Cluster Shape
@ -159,7 +158,7 @@ struct MMA2SMConfig {
};
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, EpilogueOperatorClass,
ArchTag, OperatorClass,
typename MMA1SMConfig::MmaTileShape, ClusterShape,
Shape<_128,_64>,
ElementAccumulator, ElementAccumulator,
@ -169,7 +168,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui
// , FusionOperation // Enable for SF Output
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, MainloopOperatorClass,
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
@ -187,7 +186,7 @@ using Gemm1SM = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Gemm = Gemm1SM;
using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, EpilogueOperatorClass,
ArchTag, OperatorClass,
typename MMA2SMConfig::MmaTileShape, ClusterShape,
Shape<_128,_64>,
ElementAccumulator, ElementAccumulator,
@ -197,13 +196,13 @@ using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::Collective
// , FusionOperation // Enable for SF Output
>::CollectiveOp;
using CollectiveMainloop2SM = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, MainloopOperatorClass,
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
typename MMA2SMConfig::MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
static_cast<int>(sizeof(typename CollectiveEpilogue2SM::SharedStorage))>,
typename MMA2SMConfig::KernelSchedule
>::CollectiveOp;
using GemmKernel2SM = cutlass::gemm::kernel::GemmUniversal<
@ -233,7 +232,7 @@ using LayoutSFD = typename Sm1xxBlockScaledOutputConfig::LayoutSF;
std::vector<StrideA> stride_A_host;
std::vector<StrideB> stride_B_host;
std::vector<LayoutSFA> layout_SFA_host;
std::vector<LayoutSFA> layout_SFB_host;
std::vector<LayoutSFB> layout_SFB_host;
std::vector<StrideC> stride_C_host;
std::vector<StrideD> stride_D_host;
@ -897,9 +896,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (!(props.major == 10 && props.minor == 0)) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 100a).\n";
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -49,7 +49,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0)
set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes
set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a")
cutlass_example_add_executable(
75_blackwell_grouped_gemm
75_blackwell_grouped_gemm.cu

View File

@ -504,10 +504,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -504,10 +504,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -500,10 +500,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -163,7 +163,7 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
77_blackwell_mla.cu
TEST_COMMAND_OPTIONS
TEST_MLA_BASIC
TEST_MLA_SEP_REDUCTION
TEST_MLA_SEP_REDUCTION
TEST_MLA_FUSE_REDUCTION
)
target_include_directories(77_blackwell_mla_2sm_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
@ -175,8 +175,8 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
77_blackwell_mla.cu
TEST_COMMAND_OPTIONS
TEST_MLA_BASIC
TEST_MLA_SEP_REDUCTION
TEST_MLA_FUSE_REDUCTION
TEST_MLA_SEP_REDUCTION
TEST_MLA_FUSE_REDUCTION
)
target_include_directories(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_compile_definitions(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${PREC_MACRO} CPASYNC)

View File

@ -100,7 +100,7 @@ public:
cutlass::fmha::kernel::FmhaKernelBwdConvert<ProblemShape, Element, ElementAccumulator>
>;
using OperationMha= cutlass::fmha::device::FMHA<
using OperationNormal = cutlass::fmha::device::FMHA<
cutlass::fmha::kernel::Sm100FmhaBwdKernelTmaWarpSpecialized<
ProblemShape, Element, ElementAccumulator, TileShape, Mask
>
@ -112,7 +112,7 @@ public:
>
>;
using Operation = std::conditional_t<IsMla, OperationMla, OperationMha>;
using Operation = std::conditional_t<IsMla, OperationMla, OperationNormal>;
using Kernel = typename Operation::Kernel;

View File

@ -365,7 +365,7 @@ struct Sm100FmhaGenKernelWarpspecialized {
pipeline_corr_epi_params.role = CollectiveMainloop::PipelineE::ThreadCategory::Consumer;
}
pipeline_corr_epi_params.producer_arv_count = NumWarpsCorrection * cutlass::NumThreadsPerWarp;
pipeline_corr_epi_params.consumer_arv_count = NumWarpsEpilogue * cutlass::NumThreadsPerWarp;
pipeline_corr_epi_params.consumer_arv_count = cute::max(1, NumWarpsEpilogue * cutlass::NumThreadsPerWarp);
typename CollectiveMainloop::PipelineE pipeline_corr_epi(
shared_storage.pipelines.corr_epi,
pipeline_corr_epi_params,

View File

@ -117,7 +117,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementC, LayoutC, AlignmentC,
cutlass::epilogue::NoSmemWarpSpecialized2Sm
cutlass::epilogue::FastF32NoSmemWarpSpecialized2Sm
>::CollectiveOp;
// Build the mainloop

View File

@ -88,7 +88,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -189,7 +189,7 @@ cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -283,7 +283,7 @@ struct Result
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -489,19 +489,28 @@ int run(Options &options)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 100.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
@ -509,8 +518,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
@ -530,9 +539,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}

View File

@ -86,7 +86,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -217,7 +217,7 @@ cutlass::HostTensor<ElementSFD, cutlass::layout::PackedVectorLayout> block_refer
// Matrix-wide normalization constant
cutlass::HostTensor<ElementCompute, cutlass::layout::PackedVectorLayout> block_Normconst;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -311,7 +311,7 @@ struct Result
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -536,19 +536,28 @@ int run(Options &options)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 100.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
@ -556,8 +565,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
@ -577,9 +586,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}

View File

@ -88,7 +88,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -189,7 +189,7 @@ cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -283,7 +283,7 @@ struct Result
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -489,19 +489,28 @@ int run(Options &options)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 100.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
@ -509,8 +518,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
@ -530,9 +539,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}

View File

@ -97,7 +97,7 @@ using namespace cute;
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int,int,int>>; // <M,N,K> per group
using ElementInput = cutlass::float_e2m1_t; // Element type for Input matrix operands
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -263,7 +263,7 @@ cutlass::DeviceAllocation<ElementAccumulator> block_beta;
// NormConst is a single device-side constant value, its not per-batch or per-group
cutlass::DeviceAllocation<ElementAccumulator> norm_constant_device;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -466,7 +466,7 @@ struct Result
bool passed = false;
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -861,30 +861,39 @@ int run(Options &options, bool host_problem_shapes_available = true)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 Toolkit to run this example
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 ||
((__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)
)
) {
std::cerr << "This example requires CUDA 12.8 or newer.\n";
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support.\n";
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support.\n";
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (!(props.major == 12 && props.minor == 0)) {
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120a).\n";
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120 or 121).\n";
return 0;
}
@ -901,7 +910,7 @@ int main(int argc, char const **args) {
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
allocate(options);
initialize(options);

View File

@ -46,7 +46,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0)
set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes
set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes
if (CUTLASS_NVCC_ARCHS MATCHES 120a)
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
cutlass_example_add_executable(
79a_blackwell_geforce_nvfp4_bf16_gemm
79a_blackwell_geforce_nvfp4_bf16_gemm.cu

View File

@ -78,7 +78,7 @@
#include "helper.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -248,7 +248,7 @@ struct Result
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -507,25 +507,34 @@ int run(Options &options)
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 120.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
//
@ -540,9 +549,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -78,7 +78,7 @@
#include "helper.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -183,7 +183,7 @@ cutlass::HostTensor<outputScaleFactor, cutlass::layout::PackedVectorLayout> bloc
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
cutlass::HostTensor<outputScaleFactor, cutlass::layout::PackedVectorLayout> block_reference_SFD;
cutlass::HostTensor<ElementCompute, cutlass::layout::PackedVectorLayout> block_Normconst;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
return cute::recast_ptr<T>(ptr);
@ -259,7 +259,7 @@ struct Result
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -531,25 +531,34 @@ int run(Options &options)
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 120.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
//
@ -564,9 +573,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -27,7 +27,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES 120a)
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
cutlass_example_add_executable(
80a_blackwell_geforce_mxfp8_bf16_sparse_gemm
80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu

View File

@ -0,0 +1,104 @@
# Blockwise and Groupwise GEMM and Grouped GEMM on Blackwell
Blockwise and Groupwise GEMM and Grouped GEMM implement software scaling, with scale factors stored in the accumulator type.
The examples in this directory demonstrate how to instantiate and run these kernels.
The profiler enables instantiating and profiling different kernel configurations for Blockwise and Groupwise GEMM
to determine the best-performing kernel for your workload.
## Introduction
Blockwise and Groupwise GEMM operations enable fine-grained numerical precision control by applying scale factors at configurable granularities. This is particularly useful for quantized neural networks where different regions of tensors may have different scaling requirements.
For a GEMM $D = \alpha A B + \beta C$, we introduce two scale factor tensors, SFA
and SFB. This leads to a GEMM $D = \alpha (\text{SFA} \odot A)(\text{SFB} \odot B) + \beta C$, where $\odot$ denotes elementwise scaling under the broadcast described below.
## Scale Factor Tensors
- *SFA*: Broadcast the same scale within a block defined by _scale granularity m_ and _scale granularity k_ when scaling A.
- Scale granularity m and scale granularity k are also referred to as _scale vector m_ and _k_ respectively.
- *SFB*: Broadcast the same scale within a block defined by _scale granularity n_ and _scale granularity k_ when scaling B.
- Scale granularity n and scale granularity k are also referred to as _scale vector n_ and _k_ respectively.
These can be represented in CuTe as:
- *SFA Layout*: $((\text{scale granularity M}, M / \text{scale granularity M}), (\text{scale granularity K}, K / \text{scale granularity K})) : ((0, int), (0, int))$
- *SFB Layout*: $((\text{scale granularity N}, N / \text{scale granularity N}), (\text{scale granularity K}, K / \text{scale granularity K})) : ((0, int), (0, int))$
The 0 element strides ensure that every coordinate within a granularity block maps to the same scale factor element.
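To make the broadcast concrete, below is a minimal CuTe sketch of the SFA layout; the sizes (M = 256, K = 512, 128x128 granularity blocks) are hypothetical values chosen for illustration:
```
#include "cute/tensor.hpp"

using namespace cute;

// Hypothetical sizes: M = 256, K = 512, scale granularity m = k = 128.
// Shape:  ((128, 2), (128, 4))  -- (block extent, number of blocks) per mode.
// Stride: ((  0, 1), (  0, 2))  -- the 0 strides broadcast one scale factor
// element across each 128x128 block; the nonzero strides enumerate the 2x4
// grid of blocks compactly, M-major.
auto layout_SFA = make_layout(
    make_shape (make_shape (Int<128>{}, 2), make_shape (Int<128>{}, 4)),
    make_stride(make_stride(Int<0>{},   1), make_stride(Int<0>{},   2)));
```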
## Configuration
For convenience, the Blockwise and Groupwise implementations provide
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>`
to deduce layouts and manage compact tensors.
By default, `cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>` makes
every scale factor tensor major in the M/N mode, but this can be configured. For example,
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, UMMA::Major::K, UMMA::Major::MN>`
denotes that SFA will be major in the K dimension while SFB will be major in the N dimension.
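As a minimal sketch (granularity values are illustrative), the layout types can be deduced from the config as the examples in this directory do:
```
// Scale granularity m = 1, n = 128, k = 128 (illustrative values).
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<1, 128, 128>;

// Layout types for the SFA and SFB operands, deduced from the config.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());

// Variant with explicit majors: SFA major in K, SFB major in N.
using ScaleConfigKMajorA = cutlass::detail::Sm100BlockwiseScaleConfig<
    1, 128, 128, UMMA::Major::K, UMMA::Major::MN>;
```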
## Integration with Other Frameworks
If translating from frameworks like Torch, where SFA has shape
(M / ScaleGranularityM, K / ScaleGranularityK) and SFB has shape (K / ScaleGranularityK, N / ScaleGranularityN),
be sure to transpose SFB and B to fit the canonical CuTe layout form, so that K is always the second mode.
The strides can then be used to determine whether each tensor is MN or K major, and the layouts can be formed
either directly or with the convenience wrappers, as sketched below.
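A hypothetical helper (not a CUTLASS API) sketching that stride check for a scale factor tensor viewed as (M-blocks, K-blocks):
```
#include <cstdint>

// Hypothetical helper: classify a scale factor tensor from the strides of its
// (M-blocks, K-blocks) view, e.g. as reported by a framework like Torch.
// A unit stride on the K-blocks mode means the tensor is K-major; a unit
// stride on the M-blocks mode means it is MN-major.
inline bool sf_is_k_major(std::int64_t stride_m_blocks, std::int64_t stride_k_blocks) {
  return stride_k_blocks == 1 && stride_m_blocks != 1;
}
```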
## Kernel Selection and Profiling
To determine the most performant Blockwise/Groupwise GEMM or Grouped GEMM kernel for your use case, you can use the
[CUTLASS profiler](../../media/docs/cpp/profiler.md).
All Blockwise/Groupwise GEMMs and Grouped GEMMs with `f32` scaling of `e4m3` or runtime `f8` types can be built by
selecting a subset of kernels when configuring with CMake, passing:
`-DCUTLASS_LIBRARY_KERNELS="cutlass3x*f32xe4m3_*f32xe4m3*,cutlass3x*f32xf8_*f32xf8*"`.
The simplest way to use the profiler is to pass `m`, `n`, and `k` as well as your `scale_vec_size_m`,
`scale_vec_size_n`, and `scale_vec_size_k`. Passing `enable-best-kernel-for-fixed-shape` performs some autotuning
per kernel to determine the best rasterization order, swizzle, and cluster size. Passing `blockwiseGemm`
or `GroupedGemm` through the operation flag determines which set of operations will be profiled.
For example, this profiler command dumps the performance of all compiled kernels that support scale
granularity m = 1, scale granularity n = 128, and scale granularity k = 128 for the problem size 8192x8192x8192:
```
cutlass_profiler --operation=blockwiseGemm \
--enable-best-kernel-for-fixed-shape \
--m=8192 --n=8192 --k=8192 \
--scale_vec_size_m=1 --scale_vec_size_n=128 --scale_vec_size_k=128 \
--verification-enabled=false
```
### Kernel Naming Convention
The naming of the blockwise and groupwise kernels includes the following new pattern: for each scaled tensor and its scale factors we have
`<scale_granularity_m or scale_granularity_n>x<scale_granularity_k><accumulator type>x<scaled tensor type>`. For example,
`cutlass3x_sm100_tensorop_gemm_64x128f32xe4m3_1x128f32xe4m3_f32_f16_f16_64x128x128_1x1x1_0_nnn_align16_1sm` would denote:
- A CUTLASS 3 GEMM for SM100 that uses tensor cores.
- SFA is f32 with a 64 element scale granularity m and a 128 element scale granularity k.
- The A matrix is e4m3.
- SFB is f32 with a 1 element scale granularity n and a 128 element scale granularity k.
- The B matrix is e4m3.
- The epilogue is done in f32.
- The C matrix is f16.
- The D matrix is f16.
- The MMA tile shape is 64x128x128.
- The cluster shape is 1x1x1.
- A, B, C, and D are all column major.
- The alignment of the major modes are 16 elements for A, B, C, and D.
- The MMA variant is a 1SM instruction.
It is also worthwhile to note that C can be void if scaling by beta is not needed.
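As a sketch of a void-C epilogue (following the collective builder interface used by the examples in this repository; `ArchTag`, `OperatorClass`, tile and element aliases are assumed defined as in those examples):
```
// Passing void for ElementC tells the builder that no C operand is read
// (scaling by beta is unused), so no shared memory is allocated for C.
using CollectiveEpilogueNoC = typename cutlass::epilogue::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    void, LayoutC, AlignmentC,      // C is void: no source accumulation
    ElementD, LayoutD, AlignmentD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
```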
## Performance Tips and Tricks
- *MMA Dimensions*: on both Blackwell and Hopper tensor cores, the smallest `MMA_M` dimension is 64, while the `MMA_N`
dimension can be as small as 8 for some instructions. For problem sizes where M is small, consider computing $D^T = \alpha B^T A^T + \beta C^T$ instead.
  - After swapping A and B and transposing, the N dimension becomes the small dimension. With a small `MMA_N` we can tile more effectively without performing unnecessary computation.
- *Layout Swapping*: When optimizing with the profiler, swap the `m` and `n` inputs and adjust the layouts to reflect this swapping and transposing.
  - For example, if we have a row-major A, column-major B, and row-major D, we can swap tensors and run a kernel with:
    - a left-hand matrix that is row-major (since B transposed is row-major),
    - a right-hand matrix that is column-major (since A transposed is column-major), and
    - a column-major output (since D transposed is column-major).
When using blockwise and groupwise GEMM, we must also swap the scale vector sizes when applying this optimization. If we have a 1 element scale granularity M
and a 128 element scale granularity N, we must run a kernel with a 128 element scale granularity M and a 1 element scale granularity N, as in the sketch below.
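For instance (a sketch reusing the profiler flags shown above; layouts must also be adjusted as described), a problem that would naturally run as `--m=16 --n=8192` with `--scale_vec_size_m=1 --scale_vec_size_n=128` can be profiled in swapped form as:
```
cutlass_profiler --operation=blockwiseGemm \
                 --enable-best-kernel-for-fixed-shape \
                 --m=8192 --n=16 --k=8192 \
                 --scale_vec_size_m=128 --scale_vec_size_n=1 --scale_vec_size_k=128 \
                 --verification-enabled=false
```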

View File

@ -0,0 +1,495 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/mixed_dtype_utils.hpp"
#include "cutlass/detail/collective/mixed_input_utils.hpp"
#include "helper.h"
#include "mixed_dtype_helper.cuh"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
using MmaType = cutlass::bfloat16_t;
using QuantType = cutlass::int4b_t;
using AccumulatorType = float;
// A matrix configuration
using ElementA = MmaType; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = QuantType; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// This example manually swaps and transposes, so keep transpose of input layouts
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
using ElementZero = MmaType;
using ElementScale = MmaType;
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutD = cutlass::layout::RowMajor;
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Core kernel configurations
using ElementAccumulator = AccumulatorType; // Element type for internal accumulation
using ElementCompute = AccumulatorType; // Element type for epilogue computation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using MmaTileShape = Shape<_256,_128,_128>; // (MmaTileShape_N, MmaTileShape_M, MmaTileShape_K) as A and B will be swapped
using ClusterShape = Shape<_2,_1,_1>; // Shape of the threadblocks in a cluster
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmMixedInputSm100; // Kernel to launch based on the default setting in the Collective Builder
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm;
using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
constexpr int ScaleGranularityN = 1; //Should be less than or equal to GEMM_N
constexpr int ScaleGranularityK = 128; //Should be less than or equal to GEMM_K
using ScaleConfig = cutlass::detail::Sm100MixedInputBlockwiseScaleConfig<ScaleGranularityN, ScaleGranularityK>;
using LayoutScale = decltype(ScaleConfig::deduce_layout_scale()); // Layout type for the blockwise scale operand
LayoutScale layout_S;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
MmaTileShape, ClusterShape,
EpilogueTileType,
ElementAccumulator, ElementCompute,
// Transpose layout of D here since we use explicit swap + transpose
// the void type for C tells the builder to allocate 0 smem for the C matrix.
// We can enable this if beta == 0 by changing ElementC to void below.
ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type, AlignmentC,
ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type, AlignmentD,
EpilogueSchedule // This is the only epi supporting the required swap + transpose.
>::CollectiveOp;
// ============================================================ MIXED INPUT NO SCALES ============================================================================
//The collective will infer that the narrow type should be upcasted to the wide type.
//We swap A and B operands to the builder here
using CollectiveMainloopConvertOnly = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB>, LayoutB_Transpose, AlignmentB,
ElementA, LayoutA_Transpose, AlignmentA,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
>,
MainloopSchedule
>::CollectiveOp;
using GemmKernelConvertOnly = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopConvertOnly,
CollectiveEpilogue
>;
using GemmConvertOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelConvertOnly>;
// =========================================================== MIXED INPUT WITH SCALES ===========================================================================
// The Scale information must get paired with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the scale information.
using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB, ElementScale>, cute::tuple<LayoutB_Transpose, LayoutScale>, AlignmentB,
ElementA, LayoutA_Transpose, AlignmentA,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
>,
MainloopSchedule
>::CollectiveOp;
using GemmKernelScaleOnly = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopScaleOnly,
CollectiveEpilogue
>;
using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleOnly>;
// =========================================================== MIXED INPUT WITH SCALES AND ZEROS ==================================================================
// We specify scale + zero elements to indicate that we require both. Scales and biases have the same format.
using CollectiveMainloopScaleWithZeroPoint = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB, ElementScale, ElementZero>, cute::tuple<LayoutB_Transpose, LayoutScale>, AlignmentB,
ElementA, LayoutA_Transpose, AlignmentA,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
>,
MainloopSchedule
>::CollectiveOp;
using GemmKernelScaleWithZeroPoint = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopScaleWithZeroPoint,
CollectiveEpilogue
>;
using GemmScaleWithZeroPoint = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleWithZeroPoint>;
// =================================================================================================================================================================
using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
using StrideC = typename GemmKernelScaleOnly::StrideC;
using StrideD = typename GemmKernelScaleOnly::StrideD;
using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
//
// Data members
//
/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideC_ref stride_C_ref;
StrideD stride_D;
StrideD_ref stride_D_ref;
uint64_t seed;
// Scale and Zero share a stride since the layout and shapes must be the same.
using StrideS = typename cute::Stride<cute::Int<1>, int64_t, int64_t>;
using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
StrideS stride_S;
StrideS_ref stride_S_ref;
cutlass::DeviceAllocation<ElementA> block_A;
cutlass::DeviceAllocation<ElementB> block_B;
cutlass::DeviceAllocation<MmaType> block_B_dq;
cutlass::DeviceAllocation<ElementScale> block_scale;
cutlass::DeviceAllocation<ElementZero> block_zero;
cutlass::DeviceAllocation<ElementC> block_C;
cutlass::DeviceAllocation<typename GemmScaleOnly::EpilogueOutputOp::ElementOutput> block_D;
cutlass::DeviceAllocation<typename GemmScaleOnly::EpilogueOutputOp::ElementOutput> block_ref_D;
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(MixedDtypeOptions const& options) {
auto shape_b = cute::make_shape(options.n, options.k, options.l);
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_b);
// Reverse stride here due to swap and transpose
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.n, options.m, options.l));
stride_C_ref = cutlass::make_cute_packed_stride(StrideC_ref{}, cute::make_shape(options.m, options.n, options.l));
// Reverse stride here due to swap and transpose
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.n, options.m, options.l));
stride_D_ref = cutlass::make_cute_packed_stride(StrideD_ref{}, cute::make_shape(options.m, options.n, options.l));
layout_S = ScaleConfig::tile_atom_to_shape_scale(make_shape(options.n, options.k, options.l));
auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_S)));
block_A.reset(a_coord.product());
block_B.reset(b_coord.product());
block_B_dq.reset(b_coord.product());
block_C.reset(c_coord.product());
block_D.reset(c_coord.product());
block_ref_D.reset(c_coord.product());
block_scale.reset(blockscale_b_coord.product());
block_zero.reset(blockscale_b_coord.product());
initialize_tensor(block_A, seed + 2022);
initialize_quant_tensor(block_B, seed + 2021);
initialize_tensor(block_C, seed + 2020);
initialize_scale<QuantType, ElementScale>(block_scale, options);
initialize_zero(block_zero, options);
if(options.verify){
auto layout_B = make_layout(shape_b, stride_B);
auto scale_stride = layout_S.stride();
auto layout_scale_zero = make_layout(
make_shape(size<0>(layout_S), size<1,1>(layout_S), size<2>(layout_S)),
make_stride(size<0,1>(scale_stride), size<1,1>(scale_stride), size<2>(scale_stride))
); // layout = (options.n, scale_k, options.l) : (_1, options.n, _0)
cudaStream_t stream = cudaStreamDefault;
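// Per the --mode help text, the dequantize call below follows
// B_dq = scale * B + zero (zero-point applied after scaling).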
cutlass::dequantize(block_B_dq.get(), block_B.get(), layout_B, block_scale.get(), block_zero.get(), layout_scale_zero, ScaleGranularityK, stream);
}
}
/// Populates a Gemm::Arguments structure from the given commandline options
template <class Args, cutlass::detail::ConversionMode KernelConversionMode>
Args args_from_options(MixedDtypeOptions const& options)
{
// Swap the A and B tensors, as well as problem shapes here.
if constexpr (KernelConversionMode == cutlass::detail::ConversionMode::DirectConvert) {
return Args {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.n, options.m, options.k, options.l},
{block_B.get(), stride_B, block_A.get(), stride_A},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
}
else if constexpr(KernelConversionMode == cutlass::detail::ConversionMode::ConvertAndScale) {
return Args {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.n, options.m, options.k, options.l},
{block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), layout_S},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
}
else if constexpr(KernelConversionMode == cutlass::detail::ConversionMode::ConvertAndScaleWithZero) {
return Args {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.n, options.m, options.k, options.l},
{block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), layout_S, block_zero.get()},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
} else {
exit(-1);
}
}
bool verify(MixedDtypeOptions const& options) {
//
// Compute reference output
//
using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
MmaType, LayoutA, AlignmentA,
MmaType, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopRef,
CollectiveEpilogueRef
>;
using GemmRef = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelRef>;
typename GemmRef::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, options.l},
{block_A.get(), stride_A, block_B_dq.get(), stride_B},
{{options.alpha, options.beta}, block_C.get(), stride_C_ref, block_ref_D.get(), stride_D_ref}
};
// Run the gemm where the scaling is performed outside of the kernel.
GemmRef gemm_ref;
size_t workspace_size = GemmRef::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
CUTLASS_CHECK(gemm_ref.can_implement(arguments));
CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get()));
CUTLASS_CHECK(gemm_ref.run());
// compare_reference
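// A relative comparison with a small epsilon (rather than exact equality) is used
// because the reference multiplies pre-dequantized data and can round differently
// from the fused in-kernel conversion.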
ElementD const epsilon(1e-2f);
ElementD const non_zero_floor(1e-2f);
bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), epsilon, non_zero_floor);
return passed;
}
/// Execute a given example GEMM computation
template <typename Gemm>
int run(MixedDtypeOptions &options)
{
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options<typename Gemm::Arguments, Gemm::CollectiveMainloop::KernelConversionMode>(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
// Check if output from CUTLASS kernel and reference kernel are equal or not
MixedDtypeResult result;
if(options.verify){
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
}
else{
result.passed = true;
std::cout << " Verification: Off " << std::endl;
}
if (!result.passed) {
exit(-1);
}
mixed_dtype_profiling(gemm, options, result);
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 Toolkit to run this example
// and must have compute capability at least 100a.
bool is_correct_cuda_version = (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8);
if (!is_correct_cuda_version) {
std::cerr << "CUDA version is " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "\n";
std::cerr << "This example requires CUDA 12.8 or newer.\n";
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (props.major != 10 || props.minor != 0) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture or "
<< "later (compute capability 100a or greater).\n";
return 0;
}
//
// Parse options
//
MixedDtypeOptions options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
if (options.mode == MixedDtypeGemmMode::ConvertOnly) {
std::cout << "Running in conversion only mode." << std::endl;
run<GemmConvertOnly>(options);
}
else if (options.mode == MixedDtypeGemmMode::ScaleOnly) {
std::cout << "Running in scale mode." << std::endl;
run<GemmScaleOnly>(options);
}
else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) {
std::cout << "Running in scale and zero mode." << std::endl;
run<GemmScaleWithZeroPoint>(options);
}
else{
std::cerr << "Invalid mode " << options.mode << ". Must be 0, 1 or 2." << std::endl;
}
#endif
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,45 @@
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(TEST_S_TILE_SHAPE --m=256 --n=128 --k=32 --verify --iterations=0)
set(TEST_S_TILE_SHAPE_MULTIPLE_KITER --m=256 --n=128 --k=128 --verify --iterations=0)
set(TEST_S_DIFFERENT_MN --m=16384 --n=4608 --k=4608 --verify --iterations=0)
set(TEST_S_ONE_WAVE --m=1536 --n=1536 --k=32 --verify --iterations=0) # Assuming 144 SMs
set(TEST_S_2048 --m=2048 --n=2048 --k=2048 --verify --iterations=0) # Multi-wave
if(NOT WIN32)
cutlass_example_add_executable(
86_blackwell_mixed_dtype_gemm
86_blackwell_mixed_dtype.cu
TEST_COMMAND_OPTIONS
TEST_S_TILE_SHAPE
TEST_S_TILE_SHAPE_MULTIPLE_KITER
TEST_S_ONE_WAVE
TEST_S_2048
)
endif()

View File

@ -0,0 +1,269 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/reference/device/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cute/tensor.hpp"
#include <cuda.h>
#include <numeric>
#include "helper.h"
enum MixedDtypeGemmMode {
ConvertOnly,
ScaleOnly,
ScaleWithZeroPoint
};
/// Command line options parsing
struct MixedDtypeOptions {
bool help = false;
bool verify = false;
float alpha = 1.0f;
float beta = 0.0f;
int iterations = 1000;
int warmup = 1000;
int mode = 1;
int m = 5120, n = 4096, k = 4096;
int l = 1;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("verify")) {
verify = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("l", l);
cmd.get_cmd_line_argument("mode", mode);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("warmup", warmup);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "86_blackwell_mixed_dtype_gemm\n\n"
<< " Blackwell Mixed Data Type GEMM using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> The number of independent gemm problems with mnk shape\n"
<< " --mode=<int> The mode to run the gemm. 0 does (A @ B), 1 means A @ (scale * B), 2 means A @ (scale * B + zero-point).\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --warmup=<int> Number of warmup iterations to perform.\n\n"
<< " --verify=<int> Run verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "86_blackwell_mixed_dtype_gemm" << " --m=1024 --n=512 --k=1024 --l=10 --alpha=2 --mode=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const
{
// Two flops per multiply-add
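// Worked example: with the defaults m=5120, n=4096, k=4096, l=1,
// flop = 2 * 5120 * 4096 * 4096 ~= 1.72e11, so a 10 ms average runtime
// corresponds to roughly 17.2 TFLOP/s.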
uint64_t flop = uint64_t(2) * m * n * k * l;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct MixedDtypeResult
{
double avg_runtime_ms = 0.0;
double gflops = 0.0;
cutlass::Status status = cutlass::Status::kSuccess;
cudaError_t error = cudaSuccess;
bool passed = false;
};
/// Profiling Loop
template <class Gemm>
void mixed_dtype_profiling(
Gemm& gemm,
MixedDtypeOptions const& options,
MixedDtypeResult& result) {
if (options.iterations <= 0) return;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
std::vector<float> runtimes;
runtimes.reserve(options.iterations);
for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
cudaEventRecord(start);
CUTLASS_CHECK(gemm.run());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
if (iter >= options.warmup) {
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
runtimes.push_back(milliseconds);
}
}
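// Note: events are recorded and synchronized around every launch, so each sample
// includes per-launch overhead; the first `warmup` iterations are timed but discarded.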
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Compute average runtime and GFLOPs.
result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size();
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
/// Helpers to initialize a block of device data
template <class Element>
bool initialize_tensor(
cutlass::DeviceAllocation<Element>& block,
uint64_t seed = 2023) {
double scope_max, scope_min;
// Element serves as both input and output type here, so one bit count suffices.
int bits = cutlass::sizeof_bits<Element>::value;
if (bits == 1) {
scope_max = 2;
scope_min = 0;
}
else if (bits <= 8) {
scope_max = 2;
scope_min = -2;
}
else if (bits == 16) {
scope_max = 5;
scope_min = -5;
}
else {
scope_max = 8;
scope_min = -8;
}
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
return true;
}
template <typename Element>
bool initialize_quant_tensor(
cutlass::DeviceAllocation<Element>& block,
uint64_t seed = 2023) {
float scope_min = float(cutlass::platform::numeric_limits<Element>::lowest());
float scope_max = float(cutlass::platform::numeric_limits<Element>::max());
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
return true;
}
template <class QuantType, class Element>
bool initialize_scale(
cutlass::DeviceAllocation<Element>& block,
MixedDtypeOptions const& options,
uint64_t seed = 2023) {
if (options.mode == MixedDtypeGemmMode::ConvertOnly) {
// No scales, so just initialize with 1 so we can use the same kernel to dequantize the data.
std::vector<Element> stage(block.size(), Element(1.0f));
block.copy_from_host(stage.data());
}
else {
float elt_max_f = float(cutlass::platform::numeric_limits<QuantType>::max());
const float max_dequant_val = 4.f;
const float min_dequant_val = 0.5f;
float scope_max(max_dequant_val / elt_max_f);
float scope_min(min_dequant_val / elt_max_f);
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
}
return true;
}
template <class Element>
bool initialize_zero(
cutlass::DeviceAllocation<Element>& block,
MixedDtypeOptions const& options,
uint64_t seed = 2023) {
if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) {
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(2.0f), Element(-2.0f));
} else {
// No bias, so just initialize with 0 so we can use the same kernel to dequantize the data.
std::vector<Element> stage(block.size(), Element(0.0f));
block.copy_from_host(stage.data());
}
return true;
}
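/// Host-side reference sketch (illustrative only; the function name and the
/// (n, scale_k) scale indexing are assumptions mirroring initialize() in the
/// example, not part of the kernels): dequantizes a column-major k x n quantized
/// B following the --mode semantics above, B_dq = scale * B + zero.
template <class MmaType, class QuantType, class ElementScale, class ElementZero>
void dequantize_reference_host(
    MmaType* B_dq, QuantType const* B,
    ElementScale const* scale, ElementZero const* zero,
    int n, int k, int scale_granularity_k) {
  for (int col = 0; col < n; ++col) {
    for (int row = 0; row < k; ++row) {
      // One (scale, zero) pair per column and per scale_granularity_k rows of B,
      // matching the (n, scale_k) : (1, n) layout used for verification.
      int sidx = col + (row / scale_granularity_k) * n;
      float dq = float(scale[sidx]) * float(B[row + col * k]) + float(zero[sidx]);
      B_dq[row + col * k] = MmaType(dq);
    }
  }
}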

View File

@ -0,0 +1,518 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief An FP8 blockwise scaled GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
This example demonstrates a simple way to instantiate and run a blockwise scaling FP8 GEMM on the NVIDIA Blackwell SM120 architecture.
This kernel is optimized for the GeForce RTX 50 series GPUs.
This kernel accepts Inputs A and B with TileMxTileK and TileNxTileK FP32 block scaling, performing scaling and accumulation every TileK elements.
Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
3. Epilogue Optimization
Note that GeForce RTX 50 series GPUs do not support:
1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
2. Dynamic datatypes.
3. Runtime scaling block size.
Usage:
$ ./examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise --m=2048 --n=2048 --k=2048
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "helper.h"
#include "./utils.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = AlignmentC;
// MMA type
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
using ElementCompute = float;
// MMA and Cluster Tile Shapes
// Shape of the tile
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = Shape<_1,_1,_1>;
using ScaleConfig = decltype(cutlass::detail::sm120_trivial_blockwise_scale_config(MmaTileShape_MNK{}));
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
ElementA, cute::tuple<LayoutA, LayoutSFA>, AlignmentA,
ElementB, cute::tuple<LayoutB, LayoutSFB>, AlignmentB,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
using StrideC = typename Gemm::GemmKernel::StrideC;
using StrideD = typename Gemm::GemmKernel::StrideD;
/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideD stride_D;
// Strides just iterate over scalars and have no zeros
LayoutSFA layout_SFA;
LayoutSFB layout_SFB;
// Layouts are tiled to the problem size and the strides have zeros
uint64_t seed;
cutlass::HostTensor<ElementA , LayoutA> tensor_A;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFA;
cutlass::HostTensor<ElementB , LayoutB> tensor_B;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFB;
cutlass::HostTensor<ElementC , LayoutC> tensor_C;
cutlass::HostTensor<ElementD , LayoutD> tensor_D;
cutlass::HostTensor<ElementD , LayoutD> tensor_ref_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help = false;
bool skip_verification = false;
float alpha = 1.f, beta = 0.f;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("skip-verification")) {
skip_verification = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("l", l);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "87a_blackwell_geforce_gemm_blockwise\n\n"
<< " Blackwell FP8 GEMM with Blockwise Scaling using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> Sets the l extent (batch) of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --skip-verification Skip verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "87a_blackwell_geforce_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result {
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l));
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));
layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l));
layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l));
auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA)));
auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB)));
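// filter_zeros() folds away the broadcast (stride-0) modes of the scale-factor
// layouts, so size() counts only the distinct scale elements that need storage.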
tensor_A.resize(a_coord);
tensor_B.resize(b_coord);
tensor_C.resize(c_coord);
tensor_D.resize(c_coord);
tensor_ref_D.resize(c_coord);
tensor_SFA.resize(blockscale_a_coord);
tensor_SFB.resize(blockscale_b_coord);
initialize_tensor(tensor_A.host_view(), cutlass::Distribution::Uniform, seed + 2022);
initialize_tensor(tensor_B.host_view(), cutlass::Distribution::Uniform, seed + 2023);
initialize_tensor(tensor_C.host_view(), cutlass::Distribution::Uniform, seed + 2024);
initialize_tensor(tensor_SFA.host_view(), cutlass::Distribution::Uniform, seed + 2025);
initialize_tensor(tensor_SFB.host_view(), cutlass::Distribution::Uniform, seed + 2026);
tensor_A.sync_device();
tensor_B.sync_device();
tensor_C.sync_device();
tensor_D.sync_device();
tensor_SFA.sync_device();
tensor_SFB.sync_device();
}
/// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options) {
typename Gemm::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, options.l},
{tensor_A.device_data(), stride_A,
tensor_B.device_data(), stride_B,
tensor_SFA.device_data(), layout_SFA,
tensor_SFB.device_data(), layout_SFB},
{
{}, // epilogue.thread
tensor_C.device_data(), stride_C,
tensor_D.device_data(), stride_D
}
};
auto &fusion_args = arguments.epilogue.thread;
fusion_args.alpha = options.alpha;
fusion_args.beta = options.beta;
return arguments;
}
bool verify(Options const& options) {
//
// Compute reference output
//
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
auto B = cute::make_tensor(tensor_B.host_data(),
cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
auto C = cute::make_tensor(tensor_C.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
auto D = cute::make_tensor(tensor_ref_D.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D));
auto SFA = cute::make_tensor(tensor_SFA.host_data(), layout_SFA);
auto SFB = cute::make_tensor(tensor_SFB.host_data(), layout_SFB);
using unused_t = decltype(D);
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator,
decltype(A),
decltype(SFA),
decltype(B),
decltype(SFB)
> mainloop_params{A, SFA, B, SFB};
cutlass::reference::host::GettEpilogueParams<
ElementAccumulator,
ElementAccumulator,
ElementAccumulator,
ElementCompute,
decltype(C),
decltype(D)
> epilogue_params;
epilogue_params.C = C;
epilogue_params.D = D;
epilogue_params.alpha = options.alpha;
epilogue_params.beta = options.beta;
// get reference result
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
tensor_D.sync_host();
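// Note: a bitwise comparison is used here; the host reference is expected to
// reproduce the kernel's FP32 scale-and-accumulate results exactly.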
bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
return passed;
}
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
Result result;
if (!options.skip_verification) {
// Check if output from CUTLASS kernel and reference kernel are equal or not
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
}
// Run profiling loop
if (options.iterations > 0) {
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU with compute capability 120a or 121a)." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Run
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,539 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief An FP8 groupwise scaled GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
This example demonstrates a simple way to instantiate and run cooperative and ping-pong groupwise scaling FP8 GEMMs on the NVIDIA Blackwell SM120 architecture.
These kernels are optimized for GeForce RTX 50 series GPUs.
The groupwise-scaling kernels accept inputs A and B with 1xTileK and TileNxTileK FP32 scaling, performing scaling and accumulation every TileK elements.
The ping-pong kernel leverages a smaller tile shape to avoid register spilling for better performance.
Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
3. Epilogue Optimization
Note that GeForce RTX 50 series GPUs do not support:
1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
2. Dynamic datatypes.
3. Runtime scaling block size.
Usage:
$ ./examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise --m=2048 --n=2048 --k=2048
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "helper.h"
#include "./utils.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = AlignmentC;
// MMA type
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
using ElementCompute = float;
// MMA and Cluster Tile Shapes
// Shape of the tile
using CooperativeMmaTileShape_MNK = Shape<_128,_128,_128>;
// Smaller tile size for pingpong schedule to avoid register spilling
using PingpongMmaTileShape_MNK = Shape<_64, _128, _128>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = Shape<_1,_1,_1>;
constexpr int ScaleGranularityM = 1;
constexpr int ScaleGranularityN = 128;
constexpr int ScaleGranularityK = 128;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
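// With these granularities, SFA holds one FP32 scale per 1x128 (M x K) chunk of A
// (m * ceil(k/128) scales per batch) and SFB holds one per 128x128 (N x K) block
// of B (ceil(n/128) * ceil(k/128) per batch).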
template <class TileShape>
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
TileShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
template <class TileShape, class Schedule>
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
ElementA, cute::tuple<LayoutA, LayoutSFA>, AlignmentA,
ElementB, cute::tuple<LayoutB, LayoutSFB>, AlignmentB,
ElementAccumulator,
TileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue<TileShape>::SharedStorage))>,
Schedule // cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120
>::CollectiveOp;
template <class TileShape, class Schedule>
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop<TileShape, Schedule>,
CollectiveEpilogue<TileShape>,
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
// We are using cooperative kernel schedule by default
using CooperativeGemm = cutlass::gemm::device::GemmUniversalAdapter<
GemmKernel<CooperativeMmaTileShape_MNK, cutlass::gemm::KernelScheduleSm120Blockwise>>;
// Pingpong kernel
using PingpongGemm = cutlass::gemm::device::GemmUniversalAdapter<
GemmKernel<PingpongMmaTileShape_MNK, cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120>>;
using StrideA = typename CooperativeGemm::GemmKernel::StrideA;
using StrideB = typename CooperativeGemm::GemmKernel::StrideB;
using StrideC = typename CooperativeGemm::GemmKernel::StrideC;
using StrideD = typename CooperativeGemm::GemmKernel::StrideD;
/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideD stride_D;
// Strides just iterate over scalars and have no zeros
LayoutSFA layout_SFA;
LayoutSFB layout_SFB;
// Layouts are tiled to the problem size and the strides have zeros
uint64_t seed;
cutlass::HostTensor<ElementA , LayoutA> tensor_A;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFA;
cutlass::HostTensor<ElementB , LayoutB> tensor_B;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFB;
cutlass::HostTensor<ElementC , LayoutC> tensor_C;
cutlass::HostTensor<ElementD , LayoutD> tensor_D;
cutlass::HostTensor<ElementD , LayoutD> tensor_ref_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help = false;
bool skip_verification = false;
float alpha = 1.f, beta = 0.f;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("skip-verification")) {
skip_verification = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("l", l);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "87b_blackwell_geforce_gemm_groupwise\n\n"
<< " Blackwell FP8 GEMM with Blockwise Scaling using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> Sets the l extent (batch) of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --skip-verification Skip verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "87b_blackwell_geforce_gemm_groupwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result {
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l));
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));
layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l));
layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l));
auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA)));
auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB)));
tensor_A.resize(a_coord);
tensor_B.resize(b_coord);
tensor_C.resize(c_coord);
tensor_D.resize(c_coord);
tensor_ref_D.resize(c_coord);
tensor_SFA.resize(blockscale_a_coord);
tensor_SFB.resize(blockscale_b_coord);
initialize_tensor(tensor_A.host_view(), cutlass::Distribution::Uniform, seed + 2022);
initialize_tensor(tensor_B.host_view(), cutlass::Distribution::Uniform, seed + 2023);
initialize_tensor(tensor_C.host_view(), cutlass::Distribution::Uniform, seed + 2024);
initialize_tensor(tensor_SFA.host_view(), cutlass::Distribution::Uniform, seed + 2025);
initialize_tensor(tensor_SFB.host_view(), cutlass::Distribution::Uniform, seed + 2026);
tensor_A.sync_device();
tensor_B.sync_device();
tensor_C.sync_device();
tensor_D.sync_device();
tensor_SFA.sync_device();
tensor_SFB.sync_device();
}
/// Populates a Gemm::Arguments structure from the given commandline options
template <class Gemm>
typename Gemm::Arguments args_from_options(const Options &options) {
typename Gemm::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, options.l},
{tensor_A.device_data(), stride_A,
tensor_B.device_data(), stride_B,
tensor_SFA.device_data(), layout_SFA,
tensor_SFB.device_data(), layout_SFB},
{
{}, // epilogue.thread
tensor_C.device_data(), stride_C,
tensor_D.device_data(), stride_D
}
};
auto &fusion_args = arguments.epilogue.thread;
fusion_args.alpha = options.alpha;
fusion_args.beta = options.beta;
return arguments;
}
bool verify(const Options &options) {
//
// Compute reference output
//
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
auto B = cute::make_tensor(tensor_B.host_data(),
cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
auto C = cute::make_tensor(tensor_C.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
auto D = cute::make_tensor(tensor_ref_D.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D));
auto SFA = cute::make_tensor(tensor_SFA.host_data(), layout_SFA);
auto SFB = cute::make_tensor(tensor_SFB.host_data(), layout_SFB);
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator,
decltype(A),
decltype(SFA),
decltype(B),
decltype(SFB)
> mainloop_params{A, SFA, B, SFB};
cutlass::reference::host::GettEpilogueParams<
ElementAccumulator,
ElementAccumulator,
ElementAccumulator,
ElementCompute,
decltype(C),
decltype(D)
> epilogue_params;
epilogue_params.C = C;
epilogue_params.D = D;
epilogue_params.alpha = options.alpha;
epilogue_params.beta = options.beta;
// get reference result
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
tensor_D.sync_host();
bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
return passed;
}
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options<Gemm>(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
Result result;
if (!options.skip_verification) {
// Check if output from CUTLASS kernel and reference kernel are equal or not
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
}
// Run profiling loop
if (options.iterations > 0) {
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Run
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
printf("Running kernel with Cooperative kernel schedule:\n");
run<CooperativeGemm>(options);
printf("Running kernel with Pingpong kernel schedule:\n");
run<PingpongGemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,678 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief An FP8 groupwise scaled grouped GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
This example demonstrates an implementation of Grouped GEMM using a TMA + Blackwell SM120 TensorOp-based warp-specialized kernel
for FP8 with per-group 1x128x128 FP32 scaling factors.
In this example, M, N, and K are fixed across groups.
As RTX 50 series GPUs do not support runtime scaling block sizes, all groups share the same block scaling size.
For this example all scheduling work is performed on the device, using device-side modification of TMA descriptors
to move between groups (the problem count, set via --groups).
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#encoding-a-tensor-map-on-device
To run this example:
$ ./examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise --m=2048 --n=2048 --k=2048 --groups=10
The above command sizes all 10 groups at the given m, n, and k extents.
The same applies to alpha and beta, which are randomized per group when left unspecified.
*/
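/*
A hedged sketch (illustrative only, not used by this example): since ProblemShape is
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>, per-group extents could in
principle differ, e.g.

std::vector<cute::Shape<int,int,int>> problem_sizes_host;
problem_sizes_host.push_back({1024, 512, 2048}); // group 0
problem_sizes_host.push_back({2048, 1024, 4096}); // group 1

This example keeps M, N, and K identical across groups, and on RTX 50 series GPUs
all groups must share the same compile-time 1x128x128 scaling block size.
*/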
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "helper.h"
#include "./utils.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int,int,int>>;
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = AlignmentC;
// MMA type
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
using ElementCompute = float;
// MMA and Cluster Tile Shapes
// Shape of the tile
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Scaling Factors
using ElementSF = ElementAccumulator;
constexpr int ScaleGranularityM = 1;
constexpr int ScaleGranularityN = 128;
constexpr int ScaleGranularityK = 128;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
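// A hedged illustration of the scale-factor footprint: with ScaleGranularity
// (M,N,K) = (1,128,128), a single (M,N,K) = (1024,1024,1024) group stores
// ceil(1024/1) * ceil(1024/128) = 1024 * 8 FP32 scale factors for A and
// ceil(1024/128) * ceil(1024/128) = 8 * 8 for B (see the filter_zeros()-sized
// allocations in initialize() below).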
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
ElementA, cute::tuple<LayoutA *, LayoutSFA *>, AlignmentA,
ElementB, cute::tuple<LayoutB *, LayoutSFB *>, AlignmentB,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelScheduleSm120Blockwise
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue,
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
static_assert(cute::is_same_v<typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA, LayoutSFA>);
static_assert(cute::is_same_v<typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB, LayoutSFB>);
/// Initialization
uint64_t seed;
std::vector<StrideA> stride_A_host;
std::vector<StrideB> stride_B_host;
std::vector<StrideC> stride_C_host;
std::vector<StrideD> stride_D_host;
std::vector<LayoutSFA> layout_SFA_host;
std::vector<LayoutSFB> layout_SFB_host;
std::vector<ElementAccumulator> alpha_host;
std::vector<ElementAccumulator> beta_host;
using HostTensorA = cutlass::HostTensor<ElementA, cutlass::layout::PackedVectorLayout>;
using HostTensorB = cutlass::HostTensor<ElementB, cutlass::layout::PackedVectorLayout>;
using HostTensorC = cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout>;
using HostTensorD = cutlass::HostTensor<Gemm::EpilogueOutputOp::ElementOutput, cutlass::layout::PackedVectorLayout>;
using HostTensorSFA = cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout>;
using HostTensorSFB = cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout>;
std::vector<HostTensorA> block_A;
std::vector<HostTensorB> block_B;
std::vector<HostTensorC> block_C;
std::vector<HostTensorD> block_D;
std::vector<HostTensorD> block_ref_D;
std::vector<HostTensorSFA> block_SFA;
std::vector<HostTensorSFB> block_SFB;
cutlass::DeviceAllocation<typename ProblemShape::UnderlyingProblemShape> problem_sizes;
cutlass::DeviceAllocation<ElementA const*> ptr_A;
cutlass::DeviceAllocation<ElementB const*> ptr_B;
cutlass::DeviceAllocation<ElementSF const*> ptr_SFA;
cutlass::DeviceAllocation<ElementSF const*> ptr_SFB;
cutlass::DeviceAllocation<ElementC const*> ptr_C;
cutlass::DeviceAllocation<ElementD *> ptr_D;
cutlass::DeviceAllocation<ElementD *> ptr_ref_D;
cutlass::DeviceAllocation<StrideA> stride_A;
cutlass::DeviceAllocation<StrideB> stride_B;
cutlass::DeviceAllocation<StrideC> stride_C;
cutlass::DeviceAllocation<StrideD> stride_D;
cutlass::DeviceAllocation<LayoutSFA> layout_SFA;
cutlass::DeviceAllocation<LayoutSFB> layout_SFB;
cutlass::DeviceAllocation<ElementAccumulator*> alpha_device;
cutlass::DeviceAllocation<ElementAccumulator*> beta_device;
cutlass::DeviceAllocation<ElementAccumulator> block_alpha;
cutlass::DeviceAllocation<ElementAccumulator> block_beta;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
// Command line options parsing
struct Options {
bool help = false;
bool skip_verification = false;
float alpha = 1.f, beta = 0.f;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1, groups = 10;
std::vector<typename ProblemShape::UnderlyingProblemShape> problem_sizes_host;
RasterOrderOptions raster_order = RasterOrderOptions::AlongN;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("skip-verification")) {
skip_verification = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("groups", groups);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
char raster_char;
cmd.get_cmd_line_argument("raster", raster_char, 'N');
if (raster_char == 'N' || raster_char == 'n') {
raster_order = RasterOrderOptions::AlongN;
} else if (raster_char == 'M' || raster_char == 'm') {
raster_order = RasterOrderOptions::AlongM;
}
for (int i = 0; i < groups; ++i) {
problem_sizes_host.push_back({m, n, k});
}
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "87c_blackwell_geforce_grouped_gemm_groupwise\n\n"
<< " Blackwell FP8 GEMM with Groupwise Scaling using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --groups=<int> Sets the number of individual GEMM problems for Grouped GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --skip-verification Skip verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "87c_blackwell_geforce_grouped_gemm_groupwise" << " --m=1024 --n=512 --k=1024 --groups=8 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k * groups;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result {
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
std::vector<ElementA *> ptr_A_host(options.groups);
std::vector<ElementB *> ptr_B_host(options.groups);
std::vector<ElementSF *> ptr_SFA_host(options.groups);
std::vector<ElementSF *> ptr_SFB_host(options.groups);
std::vector<ElementC *> ptr_C_host(options.groups);
std::vector<ElementD *> ptr_D_host(options.groups);
std::vector<ElementAccumulator *> ptr_alpha_host(options.groups);
std::vector<ElementAccumulator *> ptr_beta_host(options.groups);
block_alpha.reset(options.groups);
block_beta.reset(options.groups);
for (int i = 0; i < options.groups; ++i) {
auto problem = options.problem_sizes_host.at(i);
auto [M, N, K] = problem;
auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1});
auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1});
auto stride_C = cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1});
auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1});
auto layout_A = make_layout(make_shape(M, K, 1), stride_A);
auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
auto layout_D = make_layout(make_shape(M, N, 1), stride_D);
auto layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1));
auto layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1));
stride_A_host.push_back(stride_A);
stride_B_host.push_back(stride_B);
layout_SFA_host.push_back(layout_SFA);
layout_SFB_host.push_back(layout_SFB);
stride_C_host.push_back(stride_C);
stride_D_host.push_back(stride_D);
block_A.push_back(HostTensorA(cutlass::make_Coord(size(layout_A))));
block_B.push_back(HostTensorB(cutlass::make_Coord(size(layout_B))));
block_C.push_back(HostTensorC(cutlass::make_Coord(size(layout_C))));
block_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D))));
block_SFA.push_back(HostTensorSFA(cutlass::make_Coord(size(filter_zeros(layout_SFA)))));
block_SFB.push_back(HostTensorSFB(cutlass::make_Coord(size(filter_zeros(layout_SFB)))));
block_ref_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D))));
}
for (int i = 0; i < options.groups; ++i) {
initialize_tensor(block_A.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2022);
initialize_tensor(block_B.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2023);
initialize_tensor(block_C.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2024);
initialize_tensor(block_SFA.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2025);
initialize_tensor(block_SFB.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2026);
block_A.at(i).sync_device();
block_B.at(i).sync_device();
block_C.at(i).sync_device();
block_SFA.at(i).sync_device();
block_SFB.at(i).sync_device();
ptr_A_host.at(i) = block_A.at(i).device_data();
ptr_B_host.at(i) = block_B.at(i).device_data();
ptr_C_host.at(i) = block_C.at(i).device_data();
ptr_D_host.at(i) = block_D.at(i).device_data();
ptr_SFA_host.at(i) = block_SFA.at(i).device_data();
ptr_SFB_host.at(i) = block_SFB.at(i).device_data();
alpha_host.push_back((options.alpha == std::numeric_limits<float>::max()) ? static_cast<ElementAccumulator>((rand() % 5) + 1) : options.alpha);
beta_host.push_back((options.beta == std::numeric_limits<float>::max()) ? static_cast<ElementAccumulator>(rand() % 5) : options.beta);
ptr_alpha_host.at(i) = block_alpha.get() + i;
ptr_beta_host.at(i) = block_beta.get() + i;
}
problem_sizes.reset(options.groups);
problem_sizes.copy_from_host(options.problem_sizes_host.data());
ptr_A.reset(options.groups);
ptr_A.copy_from_host(ptr_A_host.data());
ptr_B.reset(options.groups);
ptr_B.copy_from_host(ptr_B_host.data());
ptr_SFA.reset(options.groups);
ptr_SFA.copy_from_host(ptr_SFA_host.data());
ptr_SFB.reset(options.groups);
ptr_SFB.copy_from_host(ptr_SFB_host.data());
ptr_C.reset(options.groups);
ptr_C.copy_from_host(ptr_C_host.data());
ptr_D.reset(options.groups);
ptr_D.copy_from_host(ptr_D_host.data());
stride_A.reset(options.groups);
stride_A.copy_from_host(stride_A_host.data());
stride_B.reset(options.groups);
stride_B.copy_from_host(stride_B_host.data());
layout_SFA.reset(options.groups);
layout_SFA.copy_from_host(layout_SFA_host.data());
layout_SFB.reset(options.groups);
layout_SFB.copy_from_host(layout_SFB_host.data());
stride_C.reset(options.groups);
stride_C.copy_from_host(stride_C_host.data());
stride_D.reset(options.groups);
stride_D.copy_from_host(stride_D_host.data());
alpha_device.reset(options.groups);
alpha_device.copy_from_host(ptr_alpha_host.data());
beta_device.reset(options.groups);
beta_device.copy_from_host(ptr_beta_host.data());
block_alpha.copy_from_host(alpha_host.data());
block_beta.copy_from_host(beta_host.data());
}
/// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options) {
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = 0;
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
scheduler.raster_order = options.raster_order;
typename Gemm::Arguments arguments;
decltype(arguments.epilogue.thread) fusion_args;
fusion_args.alpha_ptr = nullptr;
fusion_args.beta_ptr = nullptr;
if (options.alpha != std::numeric_limits<float>::max()) {
fusion_args.alpha = options.alpha;
fusion_args.alpha_ptr_array = nullptr;
fusion_args.dAlpha = {_0{}, _0{}, 0};
} else {
fusion_args.alpha = 0;
fusion_args.alpha_ptr_array = alpha_device.get();
fusion_args.dAlpha = {_0{}, _0{}, 1};
}
if (options.beta != std::numeric_limits<float>::max()) {
fusion_args.beta = options.beta;
fusion_args.beta_ptr_array = nullptr;
fusion_args.dBeta = {_0{}, _0{}, 0};
} else {
fusion_args.beta = 0;
fusion_args.beta_ptr_array = beta_device.get();
fusion_args.dBeta = {_0{}, _0{}, 1};
}
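// Note on the dAlpha/dBeta strides above: {_0{}, _0{}, L} is an (M, N, group)
// stride, so L == 0 broadcasts a single host-side scalar to every group, while
// L == 1 reads one scalar per group from the device-side pointer array.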
arguments = {
cutlass::gemm::GemmUniversalMode::kGrouped,
{options.groups, problem_sizes.get(), options.problem_sizes_host.data()},
{ptr_A.get(), stride_A.get(),
ptr_B.get(), stride_B.get(),
ptr_SFA.get(), layout_SFA.get(),
ptr_SFB.get(), layout_SFB.get()},
{
fusion_args,
ptr_C.get(), stride_C.get(),
ptr_D.get(), stride_D.get()
},
hw_info, scheduler
};
return arguments;
}
bool verify(const Options &options) {
//
// Compute reference output
//
bool passed = true;
for (int i = 0; i < options.groups; ++i) {
auto problem = options.problem_sizes_host.at(i);
auto [M, N, K] = problem;
auto A = cute::make_tensor(block_A.at(i).host_data(),
cute::make_layout(cute::make_shape(M, K, 1), stride_A_host.at(i)));
auto B = cute::make_tensor(block_B.at(i).host_data(),
cute::make_layout(cute::make_shape(N, K, 1), stride_B_host.at(i)));
auto C = cute::make_tensor(block_C.at(i).host_data(),
cute::make_layout(cute::make_shape(M, N, 1), stride_C_host.at(i)));
auto D = cute::make_tensor(block_ref_D.at(i).host_data(),
cute::make_layout(cute::make_shape(M, N, 1), stride_D_host.at(i)));
auto SFA = cute::make_tensor(block_SFA.at(i).host_data(), layout_SFA_host.at(i));
auto SFB = cute::make_tensor(block_SFB.at(i).host_data(), layout_SFB_host.at(i));
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator,
decltype(A),
decltype(SFA),
decltype(B),
decltype(SFB)
> mainloop_params{A, SFA, B, SFB};
cutlass::reference::host::GettEpilogueParams<
ElementAccumulator,
ElementAccumulator,
ElementAccumulator,
ElementCompute,
decltype(C),
decltype(D)
> epilogue_params;
epilogue_params.C = C;
epilogue_params.D = D;
epilogue_params.alpha = alpha_host.at(i);
epilogue_params.beta = beta_host.at(i);
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
block_D.at(i).sync_host();
passed &= cutlass::reference::host::TensorEquals(block_ref_D.at(i).host_view(), block_D.at(i).host_view());
}
return passed;
}
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
Result result;
if (!options.skip_verification) {
// Check if output from CUTLASS kernel and reference kernel are equal or not
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
}
// Run profiling loop
if (options.iterations > 0) {
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << " " << options.groups << " Groups" << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Run
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
initialize(options);
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,47 @@
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
cutlass_example_add_executable(
87a_blackwell_geforce_fp8_bf16_gemm_blockwise
87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu
)
cutlass_example_add_executable(
87b_blackwell_geforce_fp8_bf16_gemm_groupwise
87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu
)
cutlass_example_add_executable(
87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise
87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu
)
endif()

View File

@ -0,0 +1,83 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/// Helper to initialize a block of device data
template <class Element, class Layout>
bool initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
double scope_max, scope_min;
int bits_input = cutlass::sizeof_bits<Element>::value;
if (bits_input == 1) {
scope_max = 2;
scope_min = 0;
}
else if (bits_input <= 8) {
scope_max = 2;
scope_min = -2;
}
else if (bits_input == 16) {
scope_max = 5;
scope_min = -5;
}
else {
scope_max = 8;
scope_min = -8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope_max, scope_min, 0);
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implemented.");
}
return true;
}

View File

@ -0,0 +1,545 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM103 architecture.
This example demonstrates a simple way to instantiate and run a blockscaled 3xFP4 GEMM on the NVIDIA Blackwell SM103 architecture.
Usage:
$ ./examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm --m=2048 --n=2048 --k=2048
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/detail/sm100_blockscaled_layout.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "helper.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e2m1_t; // Element type for A matrix operand
using ElementSFA = cutlass::float_ue4m3_t;
using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 32; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e2m1_t; // Element type for B matrix operand
using ElementSFB = cutlass::float_ue4m3_t;
using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 32; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operand
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// Kernel functional config
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm103; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
// using ElementD = cutlass::float_e2m1_t; // Enable for SF Output // Element type for D matrix operands
// Kernel Perf config
using MmaTileShape = cute::Shape<cute::_128, cute::_128, Int<768>>; // MMA's tile size
using ClusterShape = cute::Shape<cute::_2, cute::_4, cute::_1>; // Shape of the threadblocks in a cluster
// Epilogue fusion operator
using EpilogueFusionOp = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutCTag, AlignmentC,
ElementD, LayoutDTag, AlignmentD,
cutlass::epilogue::NoSmemWarpSpecialized1Sm,
EpilogueFusionOp
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementA,ElementSFA>, LayoutATag, AlignmentA,
cute::tuple<ElementB,ElementSFB>, LayoutBTag, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103 // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Reference device GEMM implementation type
using StrideA = typename Gemm::GemmKernel::StrideA;
using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale factor tensors have an interleaved layout, so carry the full Layout rather than just a stride.
using StrideB = typename Gemm::GemmKernel::StrideB;
using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale factor tensors have an interleaved layout, so carry the full Layout rather than just a stride.
using StrideC = typename Gemm::GemmKernel::StrideC;
using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
using StrideD = typename Gemm::GemmKernel::StrideD;
using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
//
// Data members
//
/// Initialization
StrideA stride_A;
LayoutA layout_A;
LayoutSFA layout_SFA;
StrideB stride_B;
LayoutB layout_B;
LayoutSFB layout_SFB;
StrideC stride_C;
LayoutC layout_C;
StrideD stride_D;
LayoutD layout_D;
uint64_t seed;
// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
cutlass::HostTensor<ElementA, cutlass::layout::PackedVectorLayout> block_A;
cutlass::HostTensor<ElementSFA, cutlass::layout::PackedVectorLayout> block_SFA;
cutlass::HostTensor<ElementB, cutlass::layout::PackedVectorLayout> block_B;
cutlass::HostTensor<ElementSFB, cutlass::layout::PackedVectorLayout> block_SFB;
cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
// Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
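// make_iterator wraps a raw pointer via cute::recast_ptr, which for sub-byte
// element types (such as 4-bit float_e2m1_t) yields a subbyte iterator so the
// host reference code can address individual elements.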
template <typename T>
auto make_iterator(T* ptr) {
return cute::recast_ptr<T>(ptr);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
float alpha, beta;
int iterations;
int m, n, k;
int swizzle = 0;
Options():
help(false),
alpha(1.f), beta(0.f),
iterations(10),
m(1024), n(1024), k(1024),
swizzle(0)
{ }
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "89_sm103_fp4_ultra_gemm\n\n"
<< " Sm103 3xFP4 GEMM using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --swizzle=<int> Cluster rasterization swizzle\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ " << "./examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const
{
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result
{
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
template <typename Element, typename Layout>
bool initialize_block(
cutlass::TensorView<Element, Layout> view,
uint64_t seed) {
double scope_max, scope_min;
constexpr int bits_input = cutlass::sizeof_bits<Element>::value;
if constexpr (bits_input == 1) {
scope_max = 2;
scope_min = 0;
}
else if constexpr (bits_input <= 6) {
scope_max = 2;
scope_min = -2;
}
else if constexpr (bits_input <= 8) {
if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
scope_max = 4;
scope_min = 1;
}
else {
scope_max = 1;
scope_min = -1;
}
}
else{
scope_max = 4;
scope_min = -4;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope_max, scope_min, 0);
return true;
}
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
block_B.reset(cutlass::make_Coord(size(layout_B)));
block_C.reset(cutlass::make_Coord(size(layout_C)));
block_D.reset(cutlass::make_Coord(size(layout_D)));
block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
initialize_block(block_A.host_view(), seed + 2021);
initialize_block(block_B.host_view(), seed + 2022);
initialize_block(block_C.host_view(), seed + 2023);
initialize_block(block_SFA.host_view(), seed + 2024);
initialize_block(block_SFB.host_view(), seed + 2025);
block_A.sync_device();
block_B.sync_device();
block_C.sync_device();
block_SFA.sync_device();
block_SFB.sync_device();
}
// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options)
{
typename Gemm::Arguments arguments {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, 1},
{ // Mainloop arguments
block_A.device_data(), stride_A,
block_B.device_data(), stride_B,
block_SFA.device_data(), layout_SFA,
block_SFB.device_data(), layout_SFB
},
{ // Epilogue arguments
{options.alpha, options.beta},
block_C.device_data(), stride_C,
block_D.device_data(), stride_D
}
};
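// max_swizzle_size controls the tile scheduler's cluster rasterization swizzle;
// values > 1 re-order CTA tile visitation (typically to improve L2 reuse),
// while 0 keeps the default order.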
arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
bool verify(const Options &options) {
using namespace cute;
// Create the arguments for host reference implementation
Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator, // ElementAccumulator
decltype(tensor_A), // TensorA
decltype(tensor_SFA), // TensorSfA
decltype(tensor_B), // TensorB
decltype(tensor_SFB) // TensorSfB
> mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
cutlass::reference::host::GettBlockScalingEpilogueParams<
ElementAccumulator, // ElementScalar
ElementAccumulator, // ElementAccumulator
ElementAccumulator, // ElementCompute
decltype(tensor_C), // TensorC
decltype(tensor_D) // TensorD
> epilogue_params{options.alpha, options.beta, tensor_C, tensor_D};
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// Comparison
block_D.sync_host();
bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
return passed;
}
/// Execute a given example GEMM computation
template <typename Gemm>
int run(Options &options)
{
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
uint8_t* workspace = nullptr;
cudaError_t status = cudaMalloc(&workspace, workspace_size);
if (status != cudaSuccess) {
std::cerr << "Failed to allocate workspace memory: " << cudaGetErrorString(status) << std::endl;
return -1;
}
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
// Keep the workspace allocated: gemm.initialize() is invoked again in the profiling loop below.
cudaDeviceSynchronize();
// Check if output from CUTLASS kernel and reference kernel are equal or not
Result result;
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
// Run profiling loop
if (options.iterations > 0)
{
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.initialize(arguments, workspace));
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
// Free workspace memory
cudaFree(workspace);
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.9 or higher Toolkit to run this example
// and the GPU must have compute capability 103.
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 3)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 103)." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,38 @@
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES 103a)
cutlass_example_add_executable(
89_sm103_fp4_ultra_gemm
89_sm103_fp4_ultra_gemm.cu
)
endif()

File diff suppressed because it is too large

View File

@ -0,0 +1,42 @@
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(TEST_RANDOM_SMALL_GROUP --groups=3 --iterations=1) # Random problem sizes
set(TEST_EPILOGUE_SMALL_GROUP --alpha=1.5 --beta=2.0 --groups=3 --iterations=1) # Random problem sizes
if (CUTLASS_NVCC_ARCHS MATCHES 103a)
cutlass_example_add_executable(
90_sm103_fp4_ultra_grouped_gemm
90_sm103_fp4_ultra_grouped_gemm.cu
TEST_COMMAND_OPTIONS
TEST_RANDOM_SMALL_GROUP
TEST_EPILOGUE_SMALL_GROUP
)
endif()

View File

@ -0,0 +1,898 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <cstdint> // uint64_t
#include <cstdio>
#include <cstdlib> // rand(), RAND_MAX
#include <string> // std::stoi
#include <vector>
#include <iostream>
#include <float.h>
#include <optional>
#include "cutlass/util/command_line.h"
// clang-format off
#include "cute/tensor.hpp" // FIX cute header file inclusion issue
// clang-format on
#include "cute/arch/mma_sm100_desc.hpp" // cute::UMMA::Major
#include "cute/numeric/numeric_types.hpp" // cute::sizeof_bits_v
#include "cutlass/complex.h" // cutlass::ComplexTransform
#include "cutlass/cutlass.h" // cutlass::Status
#include "cutlass/detail/sm100_blockscaled_layout.hpp" // cutlass::detail::Sm1xxBlockScaledOutputConfig
#include "cutlass/epilogue/thread/linear_combination.h" // cutlass::epilogue::thread::LinearCombination
#include "cutlass/gemm/device/gemv_blockscaled.h" // cutlass::gemm::device::Gemv
#include "cutlass/gemm/kernel/gemv_blockscaled.h" // cutlass::gemm::kernel::Gemv
#include "cutlass/epilogue/threadblock/epilogue_with_scaling_factor.h" // cutlass::epilogue::threadblock::GemvEpilogueWithScalingFactor
#include "cutlass/gemm_coord.h" // cutlass::GemmCoord
#include "cutlass/layout/matrix.h" // cutlass::layout::Affine2Layout_Factory
#include "cutlass/numeric_size.h" // cutlss::is_subbyte
#include "cutlass/numeric_types.h"
#include "cutlass/platform/platform.h" // cutlass::is_same_v
#include "cutlass/util/device_memory.h" // cutlass::device_memory::allocation
#include "cutlass/util/distribution.h" // cutlass::Distribution
#include "cutlass/util/host_tensor.h" // cutlass::HostTensor
#include "cutlass/util/packed_stride.hpp" // cutlass::make_cute_packed_stride
#include "cutlass/util/reference/host/gemm_complex.h" // cutlass::reference::host::GemmComplex
#include <cutlass/util/reference/host/gett.hpp> // cutlass::reference::host::GettBlockScalingMainloopParams
// cutlass::reference::host::GettBlockScalingEpilogueParams
// cutlass::reference::host::Gemm3x
#include "cutlass/util/reference/host/tensor_compare.h" // cutlass::reference::host::TensorEquals
#include "cutlass/util/reference/host/tensor_fill.h" // cutlass::reference::host::TensorFillRandomUniform
#include "cutlass/numeric_size.h" // cutlass::bits_to_bytes
// Helper Functions
template <typename T>
auto
make_iterator(T* ptr)
{
return cute::recast_ptr<T>(ptr);
}
template <typename Element, typename Layout>
bool
initialize_tensor(cutlass::TensorView<Element, Layout> view, cutlass::Distribution::Kind dist_kind, uint64_t seed)
{
if (dist_kind == cutlass::Distribution::Uniform) {
double scope_max, scope_min;
int bits_input = cutlass::sizeof_bits<Element>::value;
if (bits_input == 1) {
scope_max = 2;
scope_min = 0;
} else if (bits_input <= 6) {
scope_max = 2;
scope_min = -2;
} else if (bits_input <= 8) {
if constexpr (cutlass::is_same_v<Element, cutlass::float_ue4m3_t> ||
cutlass::is_same_v<Element, cutlass::float_ue8m0_t>) {
scope_max = 4;
scope_min = 1;
} else {
scope_max = 1;
scope_min = -1;
}
} else {
scope_max = 4;
scope_min = -4;
}
cutlass::reference::host::TensorFillRandomUniform(view, seed, scope_max, scope_min, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else {
CUTLASS_ASSERT(false);
return false;
}
return true;
}
// Base class of Testbed
template <
typename Gemv_,
// The following types are more difficult to be derived from EVT
typename ElementC, typename LayoutC, typename ElementD_,
typename LayoutD, typename ElementSFD_, typename LayoutSFD,
typename ElementCompute_, int kVectorSize_>
struct TestbedGemvFp4SFDBase
{
public:
using Gemv = Gemv_;
using ElementA = typename Gemv::ElementA;
using ElementSFA = typename Gemv::ElementSFA;
using LayoutA = typename Gemv::LayoutA;
static_assert(cutlass::is_same_v<LayoutA, cutlass::layout::RowMajor>, "only support row major matrix A");
static_assert(cutlass::sizeof_bits<ElementSFA>::value == 8, "ElementSFA should be FP8 type");
using ElementB = typename Gemv::ElementB;
using ElementSFB = typename Gemv::ElementSFB;
using LayoutB = cutlass::layout::ColumnMajor;
static_assert(cutlass::is_same_v<ElementA, ElementB>, "only support ElementA ElementB of same type");
static_assert(cutlass::sizeof_bits<ElementSFB>::value == 8, "ElementSFB should be FP8 type");
static_assert(cutlass::is_same_v<LayoutC, cutlass::layout::ColumnMajor>, "only support col major output D");
using ElementD = ElementD_;
static_assert(cutlass::is_same_v<LayoutD, cutlass::layout::ColumnMajor>, "only support col major output D");
using ElementSFD = ElementSFD_;
static_assert(cutlass::is_same_v<LayoutSFD, cutlass::layout::ColumnMajor>, "only support col major output SFD");
static_assert(cutlass::sizeof_bits<ElementSFD>::value, "only support 8 bit SFD");
using ElementAccumulator = typename Gemv::ElementAccumulator;
using ElementCompute = ElementCompute_;
static_assert(cutlass::is_same_v<ElementCompute, float>, "only support fp32 epi compute");
static constexpr int kVectorSize = kVectorSize_;
static_assert(kVectorSize == 16, "only support vs 16");
// SFD Config
static constexpr bool kIsKMajorSFD = cutlass::is_same_v<LayoutSFD, cutlass::layout::RowMajor>;
using Sm1xxBlockScaledOutputConfig =
cutlass::detail::Sm1xxBlockScaledOutputConfig<kVectorSize,
kIsKMajorSFD ? cute::UMMA::Major::K : cute::UMMA::Major::MN>;
using Blk_MN_Output = typename Sm1xxBlockScaledOutputConfig::Blk_MN;
using Blk_SF_Output = typename Sm1xxBlockScaledOutputConfig::Blk_SF;
using OutputSFAtom = typename Sm1xxBlockScaledOutputConfig::SfAtom;
// SFA SFB Config
using Sm100BlockScaledInputConfig = cutlass::detail::Sm1xxBlockScaledConfig<kVectorSize>;
using Blk_MN_Input = typename Sm100BlockScaledInputConfig::Blk_MN;
using Blk_SF_Input = typename Sm100BlockScaledInputConfig::Blk_SF;
using SfAtom_Input = typename Sm100BlockScaledInputConfig::SfAtom;
public:
TestbedGemvFp4SFDBase(cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_D_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_SFA_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_SFB_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_SFD_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2023)
: init_A(init_A_)
, init_B(init_B_)
, init_C(init_C_)
, init_D(init_D_)
, init_SFA(init_SFA_)
, init_SFB(init_SFB_)
, init_SFD(init_SFD_)
, seed(seed_)
{
}
bool initialize(cutlass::MatrixCoord problem_size, int32_t batch_count)
{
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
const int32_t gemm_n = 1;
const int32_t gemm_batch = batch_count;
// Resize Config SFA/SFB
auto k_blks_input = cutlass::ceil_div(gemm_k, cute::size<1>(shape(SfAtom_Input{})));
auto m_blks_input = cutlass::ceil_div(gemm_m, Blk_MN_Input{});
auto n_blks_input = cutlass::ceil_div(gemm_n, Blk_MN_Input{});
auto sfa_coord = cutlass::make_Coord(m_blks_input * Blk_MN_Input{} * gemm_batch, k_blks_input * Blk_SF_Input{});
auto sfb_coord = cutlass::make_Coord(n_blks_input * Blk_MN_Input{} * gemm_batch, k_blks_input * Blk_SF_Input{});
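// Illustrative sizing (hypothetical values; the real Blk_MN / Blk_SF come from
// Sm1xxBlockScaledConfig): assuming Blk_MN = 128, Blk_SF = 4, and an SF-atom K
// extent of kVectorSize * Blk_SF = 64, a problem with gemm_m = 4096, gemm_k = 2048,
// gemm_batch = 1 gives m_blks_input = 32 and k_blks_input = 32, so
// sfa_coord = (32 * 128 * 1, 32 * 4) = (4096, 128).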
auto sfa_resize_layout =
cutlass::layout::Affine2Layout_Factory<LayoutA>::layout_factory(sfa_coord, typename LayoutA::Stride{});
auto sfb_resize_layout =
cutlass::layout::Affine2Layout_Factory<LayoutB>::layout_factory(sfb_coord, typename LayoutB::Stride{});
// Use the same SFD layout generation as reference for tensor creation
using ProblemShapeType = cute::Shape<int, int, int, int>;
auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};
// Generate the same layout as reference uses
auto sfd_layout = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL);
// Extract size from the generated layout and create coordinate
auto sfd_size = cute::size(cute::filter_zeros(sfd_layout));
auto sfd_coord = cutlass::make_Coord(sfd_size, 1); // Linear layout for HostTensor
auto sfd_resize_layout =
cutlass::layout::Affine2Layout_Factory<LayoutSFD>::layout_factory(sfd_coord, typename LayoutSFD::Stride{});
// Resize Host
this->reference_D.resize({gemm_batch * gemm_m, 1}); // D col major vector
this->reference_SFD.resize(sfd_coord, sfd_resize_layout);
if (initialize_tensor(this->reference_D.host_view(), this->init_D, this->seed + 7) == false) {
printf("initialize_tensor() REF D failed\n");
return false;
}
if (initialize_tensor(this->reference_SFD.host_view(), this->init_SFD, this->seed + 9) == false) {
printf("initialize_tensor() REF SFD failed\n");
return false;
}
// Resize A/B/C/D
this->tensor_A.resize({gemm_batch * gemm_m, gemm_k}); // A row major
this->tensor_B.resize({gemm_batch * gemm_k, 1}); // B col major vector
this->tensor_C.resize({gemm_batch * gemm_m, 1}); // C col major vector
this->tensor_D.resize({gemm_batch * gemm_m, 1}); // D col major vector
this->tensor_SFA.resize(sfa_coord, sfa_resize_layout);
this->tensor_SFB.resize(sfb_coord, sfb_resize_layout);
this->tensor_SFD.resize(sfd_coord, sfd_resize_layout);
// Fill A/B/C
if (initialize_tensor(this->tensor_A.host_view(), this->init_A, this->seed + 1) == false) {
printf("initialize_tensor() A failed\n");
return false;
}
if (initialize_tensor(this->tensor_B.host_view(), this->init_B, this->seed + 2) == false) {
printf("initialize_tensor() B failed\n");
return false;
}
if (initialize_tensor(this->tensor_C.host_view(), this->init_C, this->seed + 3) == false) {
printf("initialize_tensor() C failed\n");
return false;
}
// Fill SFA/SFB
if (initialize_tensor(this->tensor_SFA.host_view(), this->init_SFA, this->seed + 4) == false) {
printf("initialize_tensor() SFA failed\n");
return false;
}
if (initialize_tensor(this->tensor_SFB.host_view(), this->init_SFB, this->seed + 5) == false) {
printf("initialize_tensor() SFB failed\n");
return false;
}
// Fill D/SFD
if (initialize_tensor(this->tensor_D.host_view(), this->init_D, this->seed + 6) == false) {
printf("initialize_tensor() D failed\n");
return false;
}
if (initialize_tensor(this->tensor_SFD.host_view(), this->init_SFD, this->seed + 8) == false) {
printf("initialize_tensor() SFD failed\n");
return false;
}
// Copy A/B/C from host to device
this->tensor_A.sync_device();
this->tensor_B.sync_device();
this->tensor_C.sync_device();
this->tensor_D.sync_device();
this->tensor_SFA.sync_device();
this->tensor_SFB.sync_device();
this->tensor_SFD.sync_device();
// SFD initialization is different:
// tensor_SFD was initialized on the host and synced to the device above; its device
// data is now copied back into reference_SFD so that both tensors hold identical bytes.
// Otherwise the "bubbles" (layout padding) in SFD could cause false-negative sanity checks.
cutlass::device_memory::copy_to_host(this->reference_SFD.host_data(), this->tensor_SFD.device_data(), sfd_size);
return true;
}
bool compare_reference()
{
// device -> host
this->tensor_D.sync_host();
bool passed = true;
// Check
passed = cutlass::reference::host::TensorEquals(this->reference_D.host_view(), this->tensor_D.host_view());
if (passed == false) {
printf("gemm_m: %d, gemm_k: %d, ", this->tensor_A.host_view().extent(0), this->tensor_A.host_view().extent(1));
printf("tensorD mismatch\n");
return false;
}
this->tensor_SFD.sync_host();
passed = cutlass::reference::host::TensorEquals(this->reference_SFD.host_view(), this->tensor_SFD.host_view());
if (passed == false) {
printf("gemm_m: %d, gemm_k: %d, ", this->tensor_A.host_view().extent(0), this->tensor_A.host_view().extent(1));
printf("tensorSFD mismatch\n");
return false;
}
return passed;
}
bool run_reference(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
float epilogue_st)
{
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
const int32_t gemm_n = 1;
const int32_t gemm_batch = batch_count;
// Run reference blockscale GETT
using ProblemShapeType = cute::Shape<int, int, int, int>;
auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};
auto SfD = make_tensor(make_iterator(this->reference_SFD.host_data()),
Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL));
using StrideA = cutlass::gemm::TagToStrideA_t<LayoutA>;
using StrideB = cutlass::gemm::TagToStrideB_t<LayoutB>;
using StrideC = cutlass::gemm::TagToStrideC_t<LayoutC>;
using StrideD = cutlass::gemm::TagToStrideC_t<LayoutD>;
StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(gemm_m, gemm_k, gemm_batch));
StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(gemm_n, gemm_k, gemm_batch));
StrideC stride_c = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(gemm_m, gemm_n, gemm_batch));
StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(gemm_m, gemm_n, gemm_batch));
auto A = make_tensor(make_iterator(this->tensor_A.host_data()),
cute::make_layout(cute::make_shape(gemm_m, gemm_k, gemm_batch), stride_a));
auto B = make_tensor(make_iterator(this->tensor_B.host_data()),
cute::make_layout(cute::make_shape(gemm_n, gemm_k, gemm_batch), stride_b));
auto C = cute::make_tensor(make_iterator(this->tensor_C.host_data()),
cute::make_layout(cute::make_shape(gemm_m, gemm_n, gemm_batch), stride_c));
auto D = cute::make_tensor(make_iterator(this->reference_D.host_data()),
cute::make_layout(cute::make_shape(gemm_m, gemm_n, gemm_batch), stride_d));
auto layout_sfa = Sm100BlockScaledInputConfig::tile_atom_to_shape_SFA(problem_shape_MNKL);
auto layout_sfb = Sm100BlockScaledInputConfig::tile_atom_to_shape_SFB(problem_shape_MNKL);
auto SfA = make_tensor(this->tensor_SFA.host_data(), layout_sfa);
auto SfB = make_tensor(this->tensor_SFB.host_data(), layout_sfb);
// Internally, the mainloop scale factors are disabled when ElementA/B == ElementSFA/B.
typename cutlass::reference::host::GettBlockScalingMainloopParams<ElementAccumulator, // ElementAccumulator
decltype(A), // TensorA
decltype(SfA), // TensorSfA
decltype(B), // TensorB
decltype(SfB) // TensorSfB
>
mainloop_params{A, SfA, B, SfB};
typename cutlass::reference::host::GettBlockScalingEpilogueParams<ElementCompute, // ElementScalar
ElementAccumulator, // ElementAccumulator
ElementCompute, // ElementCompute
decltype(C), // TensorC
decltype(D), // TensorD
decltype(SfD), // TensorSfD
cute::Int<kVectorSize>, // OutputVectorSize
cutlass::reference::host::SfStrategy::SfDGen
>
epilogue_params{alpha, beta, C, D, SfD, epilogue_st};
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
return true;
}
virtual typename Gemv::Arguments get_arguments(
cutlass::MatrixCoord problem_size, int32_t batch_count,
float epilogue_st, ElementCompute alpha, ElementCompute beta) = 0;
bool run_gemv(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
[[maybe_unused]] float epilogue_st,
bool is_profiling,
int kIterations)
{
// Batched input is not supported for testing.
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
[[maybe_unused]] const int32_t gemm_n = 1;
[[maybe_unused]] const int32_t gemm_batch = batch_count;
Gemv gemv_op;
typename Gemv::Arguments arguments = this->get_arguments(
problem_size, batch_count, epilogue_st, alpha, beta
);
cutlass::Status status = gemv_op.can_implement(arguments);
if (status != cutlass::Status::kSuccess) {
printf("can_implement() failed\n");
return false;
}
size_t workspace_size = Gemv::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
status = gemv_op.initialize(arguments, workspace.get());
if (status != cutlass::Status::kSuccess) {
printf("initialize() failed\n");
return false;
}
if (not is_profiling) {
status = gemv_op();
}
// profiling
else {
cudaError_t result;
cudaEvent_t events[2];
for (cudaEvent_t &evt : events) {
result = cudaEventCreate(&evt);
if (result != cudaSuccess) {
std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
}
// warmup
status = gemv_op();
if (status != cutlass::Status::kSuccess) {
std::cerr << "Device execution failed on warmup." << std::endl;
return false;
}
result = cudaEventRecord(events[0]);
if (result != cudaSuccess) {
std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
for (int iter_i = 0; iter_i < kIterations; ++iter_i) {
status = gemv_op();
if (status != cutlass::Status::kSuccess) {
std::cerr << "Device execution failed." << std::endl;
return false;
}
}
result = cudaEventRecord(events[1]);
if (result != cudaSuccess) {
std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
float elapsed_ms = 0;
result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]);
if (result != cudaSuccess) {
std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
for (cudaEvent_t &evt : events) {
result = cudaEventDestroy(evt);
if (result != cudaSuccess) {
std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
}
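// Performance model: FLOPs are the usual 2*M*N*K; bytes sum the A, B, and D
// traffic plus one scale factor per kVectorSize elements of each operand
// (hence the divisions by kVectorSize below). C traffic is not counted.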
int64_t flops = int64_t(gemm_m) * gemm_n * gemm_k * 2;
int64_t bytes = cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementA>) * int64_t(gemm_m) * int64_t(gemm_k)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementB>) * int64_t(gemm_k) * int64_t(gemm_n)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementD>) * int64_t(gemm_m) * int64_t(gemm_n)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFA>) * int64_t(gemm_m) * int64_t(gemm_k) / int64_t(kVectorSize)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFB>) * int64_t(gemm_k) * int64_t(gemm_n) / int64_t(kVectorSize)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFD>) * int64_t(gemm_m) * int64_t(gemm_n) / int64_t(kVectorSize));
double gflops_per_second = double(flops) * kIterations * gemm_batch / double(elapsed_ms / 1000.0f) / double(1.0e9);
double gbytes_per_second = double(bytes) * kIterations * gemm_batch / double(elapsed_ms / 1000.0f) / double(1 << 30);
double elapsed_ms_per_iter = double(elapsed_ms) / kIterations;
std::cout << " Problem: "
<< gemm_m << "-by-" << gemm_n << "-by-" << gemm_k
<< ", batch size: " << gemm_batch
<< std::endl;
std::cout << " Runtime: " << elapsed_ms_per_iter << " ms" << std::endl;
std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl;
std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl;
}
if (status != cutlass::Status::kSuccess) {
printf("gemv exec failed\n");
return false;
}
return true;
}
bool run_and_verify(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
float epilogue_st)
{
// Initialize Data
if (this->initialize(problem_size, batch_count) == false) {
return false;
}
// Run GEMV kernel
if (this->run_gemv(problem_size, batch_count, alpha, beta, epilogue_st, false /*is_profiling*/, 1) == false) {
return false;
}
// Run Reference Kernel
if (this->run_reference(problem_size, batch_count, alpha, beta, epilogue_st) == false) {
printf("run_reference() failed\n");
return false;
}
// Verify
if (this->compare_reference() == false) {
printf("compare_reference() failed\n");
return false;
}
return true;
}
bool profile(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
float epilogue_st,
int kIterations = 10)
{
// Initialize Data
if (this->initialize(problem_size, batch_count) == false) {
return false;
}
// Profile GEMV kernel
if (this->run_gemv(problem_size, batch_count, alpha, beta, epilogue_st, true /*is_profiling*/, kIterations) == false) {
return false;
}
return true;
}
public:
// Data Storage
cutlass::HostTensor<ElementA, LayoutA> tensor_A;
cutlass::HostTensor<ElementSFA, LayoutA> tensor_SFA;
cutlass::HostTensor<ElementB, LayoutB> tensor_B;
cutlass::HostTensor<ElementSFB, LayoutB> tensor_SFB;
cutlass::HostTensor<ElementC, LayoutC> tensor_C;
cutlass::HostTensor<ElementD, LayoutD> tensor_D;
cutlass::HostTensor<ElementSFD, LayoutD> tensor_SFD;
cutlass::HostTensor<ElementD, LayoutD> reference_D;
cutlass::HostTensor<ElementSFD, LayoutD> reference_SFD;
// Data Init Setting
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_D;
cutlass::Distribution::Kind init_SFA;
cutlass::Distribution::Kind init_SFB;
cutlass::Distribution::Kind init_SFD;
uint64_t seed;
};
template<typename Gemv_>
struct TestbedGemvFp4SFD : public TestbedGemvFp4SFDBase<
Gemv_,
typename Gemv_::ElementC,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementD,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementSFD,
typename Gemv_::EpilogueOutputOp::LayoutSFD,
typename Gemv_::EpilogueOutputOp::ElementCompute,
Gemv_::EpilogueOutputOp::kVectorSize
> {
using Base = TestbedGemvFp4SFDBase<
Gemv_,
typename Gemv_::ElementC,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementD,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementSFD,
typename Gemv_::EpilogueOutputOp::LayoutSFD,
typename Gemv_::EpilogueOutputOp::ElementCompute,
Gemv_::EpilogueOutputOp::kVectorSize
>;
using Base::Base;
using Gemv = Gemv_;
using ElementCompute = typename Base::ElementCompute;
using SfAtom_Input = typename Base::SfAtom_Input;
using Blk_MN_Input = typename Base::Blk_MN_Input;
using Blk_SF_Input = typename Base::Blk_SF_Input;
static constexpr int kVectorSize = Base::kVectorSize;
typename Gemv::Arguments get_arguments(
cutlass::MatrixCoord problem_size,
int32_t batch_count, float epilogue_st,
ElementCompute alpha, ElementCompute beta) override {
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
[[maybe_unused]] const int32_t gemm_n = 1;
[[maybe_unused]] const int32_t gemm_batch = batch_count;
auto k_blks_input = cutlass::ceil_div(gemm_k, cute::size<1>(shape(SfAtom_Input{})));
auto m_blks_input = cutlass::ceil_div(gemm_m, Blk_MN_Input{});
auto n_blks_input = cutlass::ceil_div(gemm_n, Blk_MN_Input{});
int batch_stride_SFA = m_blks_input * Blk_MN_Input{} * k_blks_input * Blk_SF_Input{};
int batch_stride_SFB = n_blks_input * Blk_MN_Input{} * k_blks_input * Blk_SF_Input{};
// Use the same SFD layout generation as reference to get correct batch stride
using ProblemShapeType = cute::Shape<int, int, int, int>;
auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};
// Generate the same layout as reference uses
using Sm1xxBlockScaledOutputConfig = typename Base::Sm1xxBlockScaledOutputConfig;
auto sfd_layout = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL);
// Calculate batch stride from the generated layout
// Extract the batch stride from the 3rd dimension stride
// The stride<2> gives us the stride for the batch dimension
auto batch_stride_tuple = cute::stride<2>(sfd_layout); // This returns (_0, 8192)
int batch_stride_SFD = static_cast<int>(cute::get<1>(batch_stride_tuple)); // Extract the 8192 part
// Initialize GEMV kernel
typename Gemv::Arguments arguments{
problem_size, // problem_size
batch_count, // batch_count
typename Gemv::EpilogueOutputOp::Params{
this->tensor_D.device_ref(), // tensor_d
this->tensor_SFD.device_data(), // scale_factor_d_ptr
alpha, // alpha
beta, // beta
epilogue_st, // st
batch_stride_SFD, // batch_stride_sfd
gemm_m // stride_d
},
this->tensor_A.device_ref(), // ref_A
this->tensor_B.device_data(), // ptr_B
this->tensor_C.device_data(), // ptr_C
this->tensor_D.device_data(), // ptr_D
this->tensor_SFA.device_data(), // ptr_SFA
this->tensor_SFB.device_data(), // ptr_SFB
gemm_k, // stride_A
gemm_m * gemm_k, // batch_stride_A
gemm_k, // batch_stride_B
gemm_m, // batch_stride_C
gemm_m, // batch_stride_D
batch_stride_SFA, // batch_stride_SFA
batch_stride_SFB, // batch_stride_SFB
batch_stride_SFD // batch_stride_SFD
};
return arguments;
}
};
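// ---------------------------------------------------------------------------
// A minimal, standalone sketch (not used by the example) of the batch-stride
// extraction performed in get_arguments() above. The nested shape/stride values
// here are hypothetical stand-ins for what tile_atom_to_shape_SFD() produces;
// only the access pattern -- cute::stride<2>() followed by cute::get<1>() --
// mirrors the real code. Relies on the cute/tensor.hpp include above.
inline int sfd_batch_stride_sketch() {
  using namespace cute;
  // The batch mode (mode 2) is nested, so its stride is a tuple such as (_0, 8192).
  auto sfd_like = make_layout(
      make_shape (Int<128>{}, Int<64>{}, make_shape(Int<1>{}, 4)),
      make_stride(Int<1>{}, Int<128>{}, make_stride(Int<0>{}, 8192)));
  auto batch_stride_tuple = stride<2>(sfd_like);        // (_0, 8192)
  return static_cast<int>(get<1>(batch_stride_tuple)); // 8192
}
// ---------------------------------------------------------------------------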
struct Options {
bool help = false;
int m = 4096;
int k = 2048;
int n = 1;
int batch = 1;
float alpha = 1.0f;
float beta = 0.0f;
float epilogue_st = -1.0f; // sentinel for random
bool profiling = true;
int iterations = 10;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("batch", batch);
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("epilogue_st", epilogue_st);
cmd.get_cmd_line_argument("profiling", profiling);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "91_fp4_gemv\n\n"
<< " FP4 GEMV with block-scaled inputs and outputs.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --batch=<int> Sets the batch count of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --epilogue_st=<f32> Epilogue ST value\n\n"
<< " --profiling=<bool> Whether to run profiling\n\n"
<< " --iterations=<int> Number of profiling iterations to perform\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "91_fp4_gemv" << " --m=4096 --k=2048 --batch=1 \n\n";
return out;
}
};
bool
run_fp4_gemv_device(Options const& options)
{
CUTLASS_ASSERT(options.n == 1);
using ElementA = cutlass::float_e2m1_t;
using ElementSFA = cutlass::float_e4m3_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::float_e2m1_t;
using ElementSFB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e2m1_t;
using ElementD = cutlass::float_e2m1_t;
using LayoutD = cutlass::layout::ColumnMajor;
using ElementSFD = cutlass::float_e4m3_t;
// Indicates SF is computed along the column dim; this does NOT determine the actual SFD layout.
using LayoutSFD = cutlass::layout::ColumnMajor;
using ElementAccumulatorMainloop = cutlass::half_t;
using ElementAccumulator = float;
using ElementCompute = float;
ElementCompute alpha{options.alpha};
ElementCompute beta{options.beta};
// Must be a positive number.
const float epilogue_st = options.epilogue_st < 0.f ?
static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 5)) :
options.epilogue_st;
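// (The expression above draws epilogue_st roughly uniformly from [0, 5].)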
static constexpr int kVectorSize = 16;
static constexpr int kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementA>::value;
using ThreadShape = cutlass::gemm::GemmShape<16, 8>;
static_assert(kVectorSize == ThreadShape::kM, "vector size and threads per row must be equal");
// Construct Epilogue
using EpilogueOp = typename cutlass::epilogue::threadblock::GemvEpilogueWithScalingFactor<kVectorSize,
ThreadShape,
ElementCompute,
ElementAccumulator,
ElementC,
ElementD,
ElementSFD,
LayoutD,
LayoutSFD>;
// Construct Mainloop
using Gemv = cutlass::gemm::device::GemvBlockScaled<
cutlass::gemm::kernel::
GemvBlockScaled<ElementA, LayoutA, ElementB, ElementD, ElementAccumulatorMainloop, EpilogueOp, kElementsPerAccess>>;
TestbedGemvFp4SFD<Gemv> testbed;
bool pass = true;
if (options.profiling) {
pass = testbed.profile(cutlass::MatrixCoord{options.m, options.k}, options.batch, alpha, beta, epilogue_st, options.iterations);
}
else {
pass = testbed.run_and_verify(cutlass::MatrixCoord{options.m, options.k}, options.batch, alpha, beta, epilogue_st);
}
return pass;
}
int
main(int argc, char const** argv)
{
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
// Run verification
Options verification_options = options;
verification_options.profiling = false;
bool passed = run_fp4_gemv_device(verification_options);
if (passed == false) {
printf("test fail\n");
return 1;
} else {
printf("test pass\n");
}
if (options.profiling) {
// Start profiling
printf("\nProfiling...\n");
passed = run_fp4_gemv_device(options);
if (passed == false) {
printf("profiling fail\n");
return 1;
} else {
printf("profiling completed\n");
}
}
return 0;
#else
std::cerr << "Unsupported example. Please ensure CUTLASS_ARCH_MMA_SM100_SUPPORTED is defined.\n";
return 0;
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
}

View File

@ -0,0 +1,36 @@
# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (NOT MSVC)
cutlass_example_add_executable(
91_fp4_gemv
91_fp4_gemv.cu
)
endif()

View File

@ -163,7 +163,12 @@ foreach(EXAMPLE
82_blackwell_distributed_gemm
83_blackwell_sparse_gemm
84_blackwell_narrow_precision_sparse_gemm
86_blackwell_mixed_dtype_gemm
87_blackwell_geforce_gemm_blockwise
88_hopper_fmha
89_sm103_fp4_ultra_gemm
90_sm103_fp4_ultra_grouped_gemm
91_fp4_gemv
)
add_subdirectory(${EXAMPLE})

View File

@ -1,5 +1,14 @@
# CUTLASS - Programming Examples
> [!IMPORTANT]
> ### ⚠️ **Not for Benchmarking!** ⚠️
>
> These examples are designed **solely for demonstrating CUTLASS functionality** and may **NOT optimized for performance benchmarking**.
>
> **For accurate performance measurements**, please use the **[CUTLASS Profiler](../tools/profiler/)** instead (recommended) or manually auto-tune the example, if unavailable via the profiler.
>
* [00_basic_gemm](00_basic_gemm/)
launches a basic GEMM with single precision inputs and outputs

View File

@ -39,14 +39,13 @@
*/
#pragma once
#include "cutlass/cutlass.h"
#include <iostream>
#include <cuda/atomic>
#include <cuda/std/atomic>
#include CUDA_STD_HEADER(atomic)
#include "cute/layout.hpp"
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/cuda_host_adapter.hpp"

View File

@ -452,8 +452,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,
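The rewritten dimGrid is arithmetically equivalent to the old round_up form whenever the multiplier is the same cluster extent, since for positive integers

    round_up(ceil(M/b), c) == c * ceil(M/(b*c))

(for example, M = 640, b = 128, c = 2 gives round_up(5, 2) = 6 on the left and 2 * ceil(640/256) = 6 on the right). The substantive change, as far as the hunk shows, is that the extents now come from the VMNK cluster layout (size<1> and size<2> of cluster_layout_vmnk) rather than from dimCluster.x/y, which can differ when the cluster layout carries a peer (V) mode.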

View File

@ -528,8 +528,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -567,8 +567,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -575,6 +575,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -681,8 +681,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -69,7 +69,7 @@
"import numpy as np\n",
"import random\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
"# Set to `False` to omit this information.\n",
@ -106,7 +106,7 @@
"metadata": {},
"source": [
"## Declaring and running a GEMM\n",
"To get started, one only needs to provide the tensors declared above to the `cutlass.op.Gemm` call.\n",
"To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Gemm` call.\n",
"This sets up a default GEMM operation for the given device on which you are running.\n",
"\n",
"Assuming that we are running on SM80, this default to using a GEMM that leverages FP16 Tensor Core operations.\n",
@ -123,7 +123,7 @@
"source": [
"# We specify `element_accumulator` here so as to match the kernel run by NumPy below. However,\n",
"# specifying `element_accumulator` is not required if it is the same as `element`\n",
"plan = cutlass.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor, element_accumulator=np.float32)\n",
"plan = cutlass_cppgen.Gemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=np.float32)\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
]
},
@ -133,7 +133,7 @@
"id": "4a5856de",
"metadata": {},
"source": [
"There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
"There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor."
]
},
{
@ -172,7 +172,7 @@
"metadata": {},
"source": [
"## Changing operation modes\n",
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass_cppgen.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
"\n",
"The operation mode currently in use can be returned via the `plan.opclass` property. In this case Tensor Core operations."
]
@ -197,7 +197,7 @@
"\n",
"As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n",
"\n",
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
]
},
{
@ -208,7 +208,7 @@
"outputs": [],
"source": [
"tensor_D_simt = np.zeros(tensor_C.shape).astype(type_D)\n",
"plan.opclass = cutlass.OpcodeClass.Simt\n",
"plan.opclass = cutlass_cppgen.OpcodeClass.Simt\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D_simt, alpha, beta, print_module=print_module)"
]
},
@ -262,7 +262,7 @@
"alpha = np.float16(1.)\n",
"beta = np.float16(2.)\n",
"\n",
"plan.opclass = cutlass.OpcodeClass.TensorOp\n",
"plan.opclass = cutlass_cppgen.OpcodeClass.TensorOp\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
]
},
@ -336,13 +336,13 @@
"# Stream K is exposed through the threadblock swizzle method for pre-SM90 kernels,\n",
"# and via the tile_scheduler attribute of the TileDescription for post-SM90 kernels\n",
"if plan.cc < 90:\n",
" plan.swizzling_functor = cutlass.swizzle.ThreadblockSwizzleStreamK\n",
" plan.swizzling_functor = cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK\n",
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)\n",
"else:\n",
" # Stream-K is currently only supported for warp-specialized cooperative kernels\n",
" td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative\n",
" td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
" td.tile_scheduler = cutlass.TileSchedulerType.StreamK\n",
" td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative\n",
" td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
" td.tile_scheduler = cutlass_cppgen.TileSchedulerType.StreamK\n",
"\n",
" plan.compile(td)\n",
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
@ -391,12 +391,12 @@
"metadata": {},
"outputs": [],
"source": [
"from cutlass.backend.utils.device import device_cc\n",
"from cutlass_cppgen.backend.utils.device import device_cc\n",
"\n",
"# 3xTF32 requires SM80 or higher\n",
"if device_cc() >= 80:\n",
" plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)\n",
" plan.math_operation = cutlass.MathOperation.multiply_add_fast_f32\n",
" plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
" plan.math_operation = cutlass_cppgen.MathOperation.multiply_add_fast_f32\n",
"\n",
" # Create input/output tensors in FP32\n",
" A, B = [np.ones((128, 128)).astype(np.float32) for _ in range(2)]\n",
@ -433,9 +433,9 @@
"\n",
"# FP8 is supported through the CUTLASS Python interface on SM90 and higher\n",
"if device_cc() >= 90:\n",
" plan = cutlass.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
" layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor,\n",
" layout_C=cutlass.LayoutType.ColumnMajor)\n",
" plan = cutlass_cppgen.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
" layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.ColumnMajor,\n",
" layout_C=cutlass_cppgen.LayoutType.ColumnMajor)\n",
"\n",
" # Create input/output tensors in FP8\n",
" A, B = [torch.ones((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n",

View File

@ -68,7 +68,7 @@
"source": [
"import numpy as np\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
@ -112,7 +112,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)\n",
"plan = cutlass_cppgen.op.Gemm(element=np.float16, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
]
},

View File

@ -75,7 +75,7 @@
"\n",
"## Declaring a grouped GEMM via the CUTLASS Python interface\n",
"A grouped GEMM operation is declared similarly to a GEMM operation in the CUTLASS Python interface: one\n",
"simply calls `cutlass.op.GroupedGemm`."
"simply calls `cutlass_cppgen.op.GroupedGemm`."
]
},
{
@ -85,11 +85,11 @@
"metadata": {},
"outputs": [],
"source": [
"import cutlass\n",
"import cutlass_cppgen\n",
"import torch\n",
"\n",
"dtype = torch.float16\n",
"plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)"
"plan = cutlass_cppgen.op.GroupedGemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor)"
]
},
{
@ -174,7 +174,7 @@
"outputs": [],
"source": [
"op = plan.construct()\n",
"grouped_gemm = cutlass.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
"grouped_gemm = cutlass_cppgen.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
]
},
{
@ -182,7 +182,7 @@
"id": "c8ca3991",
"metadata": {},
"source": [
"The `cutlass.emit.pytorch` function emits:\n",
"The `cutlass_cppgen.emit.pytorch` function emits:\n",
"* `out/grouped_gemm_kernel.cu`: This file contains the declaration of the CUTLASS kernel and a method to call it from PyTorch tensors\n",
"* `out/grouped_gemm.cpp`: This file contains a C++ wrapper around the aforementioned CUTLASS kernel\n",
"* `setup.py`: This file contains the `setuptools` script for building and installing the generated extension\n",

View File

@ -62,7 +62,7 @@
"import torch\n",
"import random\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
"# Set to `false` to omit this information.\n",
@ -80,7 +80,7 @@
"dilation = (1, 1)\n",
"\n",
"# Compute the output size [N, P, Q, K]\n",
"N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
"N, P, Q, K = cutlass_cppgen.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
"\n",
"dtype = torch.float16\n",
"type_A = torch.float16\n",
@ -111,7 +111,7 @@
"source": [
"## Declaring and running a Conv2d Fprop\n",
"\n",
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
"\n",
"Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n",
"\n",
@ -125,7 +125,7 @@
"outputs": [],
"source": [
"# Specifying `element_accumulator` is not required if it is the same as `element`\n",
"plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
"plan = cutlass_cppgen.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
"plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)"
]
},
@ -133,7 +133,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"There are many other ways to construct a plan from `cutlass.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass.op.Conv2dFprop` constructor.\n",
"There are many other ways to construct a plan from `cutlass_cppgen.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass_cppgen.op.Conv2dFprop` constructor.\n",
"\n",
"We then compare the output to running the Conv2d using PyTorch. PyTorch use NCHW layout by default, so permutations are required."
]
@ -200,7 +200,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan_dgrad = cutlass.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_dgrad = cutlass_cppgen.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_dgrad.run(grad_output, weight, tensor_C_dgrad, grad_input, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
"\n",
"grad_input_torch = alpha * torch.nn.grad.conv2d_input(\n",
@ -225,7 +225,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan_wgrad = cutlass.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_wgrad = cutlass_cppgen.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_wgrad.run(grad_output, input, tensor_C_wgrad, grad_weight, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
"\n",
"grad_weight_torch = alpha * torch.nn.grad.conv2d_weight(\n",

View File

@ -67,17 +67,17 @@
"outputs": [],
"source": [
"import torch\n",
"import cutlass\n",
"from cutlass.epilogue import relu\n",
"from cutlass import Tensor as FakeTensor\n",
"from cutlass.utils.profiler import CUDAEventProfiler\n",
"import cutlass_cppgen\n",
"from cutlass_cppgen.epilogue import relu\n",
"from cutlass_cppgen import Tensor as FakeTensor\n",
"from cutlass_cppgen.utils.profiler import CUDAEventProfiler\n",
"\n",
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
"print_module = True\n",
"\n",
"# The Epilogue Visitor feature currently only works for SM80 and 90\n",
"from cutlass.backend.utils.device import device_cc\n",
"from cutlass_cppgen.backend.utils.device import device_cc\n",
"if device_cc() not in [80, 90]:\n",
" import sys\n",
" sys.exit()\n",
@ -99,7 +99,7 @@
"tensor_C = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
"tensor_D = torch.zeros_like(tensor_C)\n",
"\n",
"plan = cutlass.op.Gemm(element=torch.float16, layout=cutlass.LayoutType.RowMajor, element_accumulator=torch.float32)"
"plan = cutlass_cppgen.op.Gemm(element=torch.float16, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=torch.float32)"
]
},
{
@ -115,7 +115,7 @@
"\n",
"The example tensors is a dictionary with tensor names as keys and reference tensors as values. The reference tensors can be `float`, `torch.Tensor`, `numpy.ndarray`, or our `FakeTensor`. They provides the shape and data type information of the inputs and outputs of the epilogue.\n",
"\n",
"The epilogue can be generated simply through `cutlass.evt.trace(<epilogue function>, <example_tensors>)`."
"The epilogue can be generated simply through `cutlass_cppgen.evt.trace(<epilogue function>, <example_tensors>)`."
]
},
{
@ -139,7 +139,7 @@
"bias = torch.ceil(torch.empty(size=(m, 1), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
"tensor_F = torch.zeros_like(tensor_D)\n",
"examples_tensors = {\n",
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass.LayoutType.RowMajor),\n",
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass_cppgen.LayoutType.RowMajor),\n",
" \"alpha\": alpha,\n",
" \"C\": tensor_C,\n",
" \"beta\": beta,\n",
@ -150,7 +150,7 @@
"}\n",
"\n",
"# Trace the epilogue visitor\n",
"epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)"
"epilogue_visitor = cutlass_cppgen.epilogue.trace(example_epilogue, examples_tensors)"
]
},
{