diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc269c8b..9f423fb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,31 @@
# NVIDIA CUTLASS Changelog
+
+## [3.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.0) (2025-03-20)
+
+* Support for Blackwell SM120 kernels for GeForce GPUs in the CUTLASS 3.x API:
+ - Collective mainloops that target:
+ * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp)
+ * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp)
+ - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
+ - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp).
+* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture:
+ - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu).
+ - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu).
+ - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu).
+* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM.
+* Enhancements and new support for block-wise and group-wise GEMM on the Hopper and Blackwell architectures:
+ - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture.
+ - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture.
+ - Support for [grouped GEMM with blockwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture.
+ - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture.
+ - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture.
+* Support for enhanced kernel performance search in the CUTLASS profiler:
+ - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels.
+ - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance.
+ - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration.
+ - A more detailed introduction and examples of how to use these features can be found in [profiler.md](./media/docs/profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss).
+
## [3.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.8.0) (2025-01-25)
* Support for new CuTe building blocks specifically for Blackwell SM100 architecture:
@@ -538,4 +564,3 @@ SPDX-License-Identifier: BSD-3-Clause
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65821237..1e6f298e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,6 +102,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
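+# Do not truncate template instantiation backtraces in compiler diagnostics (a limit of 0 means unlimited).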
+list(APPEND CUTLASS_CUDA_NVCC_FLAGS -ftemplate-backtrace-limit=0)
+
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE)
endif()
@@ -173,7 +175,7 @@ if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.8)
- list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100 100a)
+ list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100 100a 101 101a 120 120a)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
@@ -441,7 +443,7 @@ if (NOT MSVC AND CUTLASS_NVCC_KEEP)
# MSVC flow handles caching already, but for other generators we handle it here.
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
file(MAKE_DIRECTORY ${CUTLASS_NVCC_KEEP_DIR})
- list(APPEND CUTLASS_CUDA_NVCC_FLAGS --keep -v) # --keep-dir may not work with nvcc for some directories.
+ list(APPEND CUTLASS_CUDA_NVCC_FLAGS --keep -v -objtemp) # --keep-dir may not work with nvcc for some directories.
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -save-temps=${CUTLASS_NVCC_KEEP_DIR})
endif()
@@ -468,6 +470,13 @@ if(UNIX)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=-fno-strict-aliasing)
endif()
+# Known CUDA Toolkit 11.4 issue (fixed in later toolkit releases).
+# Also see https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void-function-in-constexpr-if-fun
+if (CUDA_VERSION VERSION_LESS 11.5.0)
+ list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcudafe "--diag_suppress=implicit_return_from_non_void_function" )
+ message(STATUS "CUDA ${CUDA_VERSION} < 11.5: suppressing implicit_return_from_non_void_function diagnostics")
+endif()
+
# Don't leak lineinfo in release builds
if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -gmlt)
@@ -1045,6 +1054,7 @@ function(cutlass_generate_profiler_tests NAME)
string(REGEX REPLACE "_cluster_k_fallback=[0-9]+" "" TEST_NAME "${TEST_NAME}")
string(REPLACE "runtime_input_datatype_a=" "" TEST_NAME "${TEST_NAME}")
string(REPLACE "runtime_input_datatype_b=" "" TEST_NAME "${TEST_NAME}")
+ string(REPLACE "swizzle_size=" "" TEST_NAME "${TEST_NAME}")
string(REGEX REPLACE "verification_enabled=(true|false)" "" TEST_NAME "${TEST_NAME}")
string(REGEX REPLACE "warmup_iterations=[0-9]+" "" TEST_NAME "${TEST_NAME}")
string(REGEX REPLACE "profiling_iterations=[0-9]+" "" TEST_NAME "${TEST_NAME}")
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 843ed365..46506007 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -128,3 +128,35 @@ Bryce Lelbach
Joel McCormack
Kyrylo Perelygin
Sean Treichler
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/PUBLICATIONS.md b/PUBLICATIONS.md
index c91fc06a..176b42e4 100644
--- a/PUBLICATIONS.md
+++ b/PUBLICATIONS.md
@@ -2,10 +2,14 @@
## 2025
+- ["Comet: Fine-grained Computation-communication Overlapping for Mixture-of-Experts"](https://arxiv.org/abs/2502.19811). Shulai Zhang, Ningxin Zheng, Haibin Lin, Ziheng Jiang, Wenlei Bao, Chengquan Jiang, Qi Hou, Weihao Cui, Size Zheng, Li-Wen Chang, Quan Chen, Xin Liu. _arXiv_, February 2025.
+
- ["ParetoQ: Scaling Laws in Extremely Low-bit LLM Quantization"](https://arxiv.org/abs/2502.02631). Zechun Liu, Changsheng Zhao, Hanxian Huang, Sijia Chen, Jing Zhang, Jiawei Zhao, Scott Roy, Lisa Jin, Yunyang Xiong, Yangyang Shi, Lin Xiao, Yuandong Tian, Bilge Soran, Raghuraman Krishnamoorthi, Tijmen Blankevoort, Vikas Chandra. _arXiv_, February 2025.
## 2024
+- ["DeepSeek-V3 Technical Report"](https://arxiv.org/abs/2412.19437). DeepSeek-AI. _arXiv_, December 2024.
+
- ["ShadowKV: KV Cache in Shadows for High-Throughput Long-Context LLM Inference"](https://arxiv.org/abs/2410.21465). Hanshi Sun, Li-Wen Chang, Wenlei Bao, Size Zheng, Ningxin Zheng, Xin Liu, Harry Dong, Yuejie Chi, Beidi Chen. _arXiv_, October 2024.
- ["FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion"](https://arxiv.org/abs/2406.06858). Li-Wen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Chengji Yao, Ziheng Jiang, Haibin Lin, Xin Jin, Xin Liu. _arXiv_, June 2024.
@@ -64,3 +68,35 @@
"](https://arxiv.org/abs/2008.13006). Cong Guo, Bo Yang Hsueh, Jingwen Leng, Yuxian Qiu, Yue Guan, Zehuan Wang, Xiaoying Jia, Xipeng Li, Minyi Guo, Yuhao Zhu. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2020.
- ["Strassen's Algorithm Reloaded on GPUs"](https://dl.acm.org/doi/10.1145/3372419). Jianyu Huang, Chenhan D. Yu, Robert A. van de Geijn. _ACM Transactions on Mathematical Software_, March 2020.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/README.md b/README.md
index ada18b39..77a81620 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@

-# CUTLASS 3.8.0
+# CUTLASS 3.9.0
-_CUTLASS 3.8.0 - January 2025_
+_CUTLASS 3.9.0 - March 2025_
CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-matrix multiplication (GEMM) and related computations at all levels
@@ -38,65 +38,30 @@ See the [functionality docs](./media/docs/functionality.md) for a more comprehen
list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU
architecture.
-# What's New in CUTLASS 3.8
+# What's New in CUTLASS 3.9
-CUTLASS 3.8 is the first release that supports the NVIDIA Blackwell SM100 architecture.
-For a background on Blackwell's new features, please consult the PTX documentation for CUDA 12.8.
-
-* Support for new CuTe building blocks specifically for Blackwell SM100 architecture:
- - [5th generation Blackwell Tensor Core instructions (TCGen05)](./include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms.
- - Extensions to [Tensor Memory Accelerator](./include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms.
- - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](./include/cute/pointer.hpp) across CuTe as a first class data locale.
- - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](./include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe.
- - [`make_tmem_copy()`](./include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms.
- - Support for [new variants of LDSM on Blackwell](./include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms.
-* Support for new CUTLASS building blocks specifically for Blackwell SM100 architecture:
- - Various narrow precision [FP4, FP6, and FP8](./include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](./include/cutlass/float_subbyte.h)
- - [Pipelines that implement Blackwell specific synchronization](./include/cutlass/pipeline/sm100_pipeline.hpp).
- - [Cluster launch control API supporting preferred and fallback cluster shapes](./include/cutlass/cluster_launch.hpp).
- - Data types including NVFP4, MXFP4, MXFP6, and MXFP8 and all their supported element and scale factor types.
- - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](./include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](./include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp).
- - Extensions to testbeds and reference check code for unit tests and CUTLASS profiler.
-* Full support for Blackwell SM100 kernels in CUTLASS 3.x API:
- - [Blackwell specific kernel layers](./include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that
- + Implement a new warp-specialization recipe tuned specifically for Blackwell SM100 architecture.
- + Leverage all the new features such as CLC based tile scheduling, preferred cluster, and TMEM based double buffering of accumulators.
- + Support stream-K load balancing for all kernel types everywhere via composable scheduler support.
- - Blackwell collective mainloops that target the TCGen05 MMA instructions (both SS and TS) for
- * [Non-block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp)
- * [Non-block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_array_warpspecialized.hpp)
- * [Block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp)
- * [Block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp)
- - Blackwell [collective mainloop for convolution kernels](./include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp) supporting non-block scaled data types for fprop, dgrad, and wgrad.
- - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp), [convolution](./include/cutlass/conv/dispatch_policy.hpp), and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
- - [Blackwell epilogue that supports loading accumulators from `tmem`](./include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp) and [full set of EVT fusions]().
-* CUTLASS library and profiler integration for block scaled data types for kernel emission, profiling, and verification.
- - Support for preferred and fallback cluster shapes via profiler command line arguments parsing to set dynamic cluster shapes.
- - Support for dynamic datatypes by parsing profiler via profiler command line arguments parsing to set dynamic datatype setting in TCGen05 MMA instruction descriptors.
- - Support for mixed input GEMM kernels on Hopper in the profiler.
-* New CUTLASS profiler flag `use-cuda-graphs` to reduce overheads when benchmarking launch-bound kernels.
-* A new 3.x version of grouped GEMM to the CUTLASS library and generates kernels for Hopper and Blackwell. Now grouped GEMM support is enabled in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details).
-* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM100 architecture:
- - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](./examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API.
- - GEMM with [opt-in collective builder schedules showcasing available recipes](./examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell.
- - Block scaled data type GEMMs targeting Blackwell's native block scaled Tensor Cores:
- + [NVFP4 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu)
- + [NVFP4 inputs with NVFP4 output](./examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu)
- + [Mixed MXFP8 and MXFP6 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu)
- - GEMM example demonstrating [Blackwell's new preferred cluster support via dynamic cluster shapes](./examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for increased occupancy.
- - [GEMM with CLC based StreamK scheduler for load balancing](./examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu).
- - Grouped GEMM for [vanilla FP8 data inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu).
- - Convolution kernels for [fprop](./examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](./examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](./examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu).
- - [Fused multi-head attention fprop kernel](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32,64, and 128.
- - A new BF16x9 GEMM [kernel](./examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations.
-* Set of examples that demonstrate the usage of the 3.x API for targeting Hopper architecture:
- - A set of new [Hopper grouped GEMM kernels](./examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes.
- - A new [Hopper FP8 GEMM with groupwise scaling](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu).
-* Documentation updates:
- - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/quickstart.md#instantiating-a-blackwell-gemm-kernel).
- - Detailed [Blackwell block-scaled GEMM functionality documentation](./media/docs/blackwell_functionality.md)
- - A new [functionality documentation](./media/docs/functionality.md) specifically for 3.x API comprehensively documenting all supported kernel types, data types, kernel features, minimum CUDA tookit support etc for 3.x supported architectures.
- - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#Target-Architecture).
+* Support for Blackwell SM120 kernels for GeForce GPUs in the CUTLASS 3.x API (see the builder sketch after this list):
+ - Collective mainloops that target:
+ * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp)
+ * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp)
+ - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
+ - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp).
+* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture:
+ - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu).
+ - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu).
+ - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu).
+* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM.
+* Enhancements and new support for block-wise and group-wise GEMM on the Hopper and Blackwell architectures:
+ - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture.
+ - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture.
+ - Support for [grouped GEMM with blockwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture.
+ - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture.
+ - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture.
+* Support for enhanced kernel performance search in the CUTLASS profiler:
+ - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels.
+ - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance.
+ - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration.
+ - A more detailed introduction and examples of how to use these features can be found in [profiler.md](./media/docs/profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss).
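+
+To make the new SM120 support more concrete, below is a minimal, untested sketch of how such a block-scaled GEMM can be assembled with the 3.x collective builder API. It follows the established SM90/SM100 builder pattern; the SM120-specific tags (`cutlass::arch::Sm120`, `cutlass::arch::OpClassBlockScaledTensorOp`), the NVFP4 wrapper type, and the tile/alignment choices below are assumptions rather than the shipping configuration. See [example 79a](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu) for the authoritative type definitions.
+
+```cpp
+// Minimal sketch only: names marked "assumed" may differ from the shipping example.
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cute/tensor.hpp"
+
+using ElementA           = cutlass::nv_float4_t<cutlass::float_e2m1_t>;  // assumed NVFP4 wrapper (data + scale factors)
+using ElementB           = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+using ElementC           = cutlass::bfloat16_t;
+using ElementAccumulator = float;
+using TileShape          = cute::Shape<cute::_128, cute::_128, cute::_128>;  // assumed CTA tile
+using ClusterShape       = cute::Shape<cute::_1, cute::_1, cute::_1>;
+
+// Build the epilogue first so its shared-memory footprint can be carved out of the mainloop stages.
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+    cutlass::arch::Sm120, cutlass::arch::OpClassBlockScaledTensorOp,          // assumed SM120 tags
+    TileShape, ClusterShape,
+    cutlass::epilogue::collective::EpilogueTileAuto,
+    ElementAccumulator, ElementAccumulator,
+    ElementC, cutlass::layout::ColumnMajor, 8,
+    ElementC, cutlass::layout::ColumnMajor, 8,
+    cutlass::epilogue::collective::EpilogueScheduleAuto
+  >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+    cutlass::arch::Sm120, cutlass::arch::OpClassBlockScaledTensorOp,          // assumed SM120 tags
+    ElementA, cutlass::layout::RowMajor, 32,                                  // 32 = 128 bits / 4-bit NVFP4 elements
+    ElementB, cutlass::layout::ColumnMajor, 32,
+    ElementAccumulator,
+    TileShape, ClusterShape,
+    cutlass::gemm::collective::StageCountAutoCarveout<
+        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+    cutlass::gemm::collective::KernelScheduleAuto
+  >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+    cute::Shape<int, int, int, int>,   // problem shape (M, N, K, L)
+    CollectiveMainloop,
+    CollectiveEpilogue>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+```
+
+The resulting `Gemm` type is used like any other 3.x device GEMM: populate `Gemm::Arguments`, then call `initialize()` and `run()`.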
Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits.
CUTLASS team is working on a fix.
diff --git a/customConfigs.cmake b/customConfigs.cmake
index e39212db..d98fe6c5 100644
--- a/customConfigs.cmake
+++ b/customConfigs.cmake
@@ -65,10 +65,10 @@ endfunction()
if(CUTLASS_BUILD_FOR_PROFILER_REGRESSIONS)
- set(PROFILER_ARCH_LIST 100a)
+ set(PROFILER_ARCH_LIST 100a 101a 120a)
foreach(ARCH IN LISTS CUTLASS_NVCC_ARCHS)
if(NOT (ARCH IN_LIST PROFILER_ARCH_LIST))
- message(FATAL_ERROR "Only SM100a compute capability is supported with profiler-based unit tests")
+ message(FATAL_ERROR "Only the SM100a, SM101a, and SM120a compute capabilities are supported with profiler-based unit tests")
endif()
endforeach()
diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md
index 9fa8297d..ed9b2727 100644
--- a/examples/13_two_tensor_op_fusion/README.md
+++ b/examples/13_two_tensor_op_fusion/README.md
@@ -115,4 +115,3 @@ SPDX-License-Identifier: BSD-3-Clause
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
-
diff --git a/examples/40_cutlass_py/README.md b/examples/40_cutlass_py/README.md
index c670e340..02222f8e 100644
--- a/examples/40_cutlass_py/README.md
+++ b/examples/40_cutlass_py/README.md
@@ -2,3 +2,35 @@
This directory contains deprecated examples for PyCUTLASS, a precursor to the CUTLASS Python interface.
For examples of using CUTLASS's actively-maintained Pythonic interface, see the [examples/python](/examples/python) directory.
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/40_cutlass_py/customizable/README.md b/examples/40_cutlass_py/customizable/README.md
index e8aeee9e..b6863fb0 100644
--- a/examples/40_cutlass_py/customizable/README.md
+++ b/examples/40_cutlass_py/customizable/README.md
@@ -165,3 +165,35 @@ Example 7: GELU
```python
python gemm.py -i 16 8 16 -ta bfloat16 -tb bfloat16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 64 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -p 512 256 128 -alpha 0.0 -beta 0.5 -gm GemmSplitKParallel -k 5 -bias -activ gelu
```
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/55_hopper_mixed_dtype_gemm/README.md b/examples/55_hopper_mixed_dtype_gemm/README.md
index ca64c901..7c61e75c 100644
--- a/examples/55_hopper_mixed_dtype_gemm/README.md
+++ b/examples/55_hopper_mixed_dtype_gemm/README.md
@@ -41,3 +41,35 @@ We are currently optimizing the following cases:
* Optimizations for memory bound cases.
* Optimizations for scale and zero-point loading when the group size is not equal to the threadblock-k size.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/59_ampere_gather_scatter_conv/README.md b/examples/59_ampere_gather_scatter_conv/README.md
index 4aac0536..2f3d8b83 100644
--- a/examples/59_ampere_gather_scatter_conv/README.md
+++ b/examples/59_ampere_gather_scatter_conv/README.md
@@ -207,3 +207,35 @@ With this in mind, this example kernel has the following limitations:
- This example kernel only supports dynamic image count, all other conv problem shape must be defined as `cute::Constant<>`s
- Problem shapes (including dynamic image count `N`) must be evenly divisible by the tile shape
- It does not perform fp32->tf32 numeric conversion, gmem inputs must be rounded to tf32 already
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
index c9f638e6..72f59476 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
+++ b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
@@ -26,11 +26,13 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-include_directories(
- .
-)
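+# Test case options for the CTest harness: --iterations=0 skips the profiling loop and runs functional verification only.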
+set(TEST_PREFETCH_CASE --m=8192 --n=64 --k=8192 --iterations=0)
cutlass_example_add_executable(
63_hopper_gemm_with_weight_prefetch
63_hopper_gemm_with_weight_prefetch.cu
- )
+ TEST_COMMAND_OPTIONS
+ TEST_PREFETCH_CASE
+)
+
+target_include_directories(63_hopper_gemm_with_weight_prefetch PUBLIC .)
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/README.md b/examples/63_hopper_gemm_with_weight_prefetch/README.md
index 5dac1cc6..3fd615ff 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/README.md
+++ b/examples/63_hopper_gemm_with_weight_prefetch/README.md
@@ -74,9 +74,40 @@ echo "Overlap ratio of 0.8, prefetch ratio of 0.7"
However, note that the example still runs a single GEMM, and most of the performance improvement
is expected in end to end applications.
-
## Limitations
* The parameter defaults are typically not good choices, especially `prefetch_ratio`.
When `prefetch_ratio` is unspecified (set to `-1.0`), the prefetch warp will `try_wait` on a
memory barrier before issuing every single TMA load, and in many cases this will slow down
prefetching to the point of being almost ineffective.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
index 0c54bc05..73655ad2 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
@@ -362,11 +362,11 @@ public:
using ClusterSyncWithPrefetchBarrier = typename cutlass::arch::NamedBarrier;
auto prefetcher_arrive_barrier = ClusterSyncWithPrefetchBarrier(
blockDim.x * blockDim.y * blockDim.z,
- /*reserved_named_barriers_*/ 14);
+ /*id*/ 0);
// Prefetcher warp doesn't arrive on this barrier.
auto cluster_arrive_barrier = ClusterSyncWithPrefetchBarrier(
blockDim.x * blockDim.y * blockDim.z - NumThreadsPerWarp,
- /*reserved_named_barriers_*/ 15);
+ /*id*/ 1);
if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::PrefetchMK) {
__syncwarp();
diff --git a/examples/65_distributed_gemm/README.md b/examples/65_distributed_gemm/README.md
index fc53e6bf..e3c48a9d 100644
--- a/examples/65_distributed_gemm/README.md
+++ b/examples/65_distributed_gemm/README.md
@@ -62,3 +62,36 @@ procedure is the same, simply modify the following line in the example:
```cpp
using TP = _8;
```
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
+
diff --git a/examples/65_distributed_gemm/REQUIREMENTS.md b/examples/65_distributed_gemm/REQUIREMENTS.md
index cc0d5632..4b8cca3b 100644
--- a/examples/65_distributed_gemm/REQUIREMENTS.md
+++ b/examples/65_distributed_gemm/REQUIREMENTS.md
@@ -84,3 +84,35 @@ GPU5 OK OK OK OK OK X OK OK
GPU6 OK OK OK OK OK OK X OK
GPU7 OK OK OK OK OK OK OK X
```
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
index e4afcb30..1c21678f 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
@@ -100,7 +100,7 @@ using LayoutB = cutlass::layout::ColumnMajor; // L
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
-using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands
+using ElementC = float; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
@@ -251,93 +251,93 @@ struct Result
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
- template <typename Element, typename Layout>
- bool initialize_tensor(
- cutlass::TensorView<Element, Layout> view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template <typename Element, typename Layout>
+bool initialize_tensor(
+ cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
- int bits_input = cutlass::sizeof_bits<Element>::value;
- int bits_output = cutlass::sizeof_bits<Element>::value;
+ double scope_max, scope_min;
+ int bits_input = cutlass::sizeof_bits<Element>::value;
+ int bits_output = cutlass::sizeof_bits<Element>::value;
- if (bits_input == 1) {
- scope_max = 2;
- scope_min = 0;
- } else if (bits_input <= 8) {
- scope_max = 2;
- scope_min = -2;
- } else if (bits_output == 16) {
- scope_max = 5;
- scope_min = -5;
- } else {
- scope_max = 8;
- scope_min = -8;
- }
-
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
+ if (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ } else if (bits_input <= 8) {
+ scope_max = 2;
+ scope_min = -2;
+ } else if (bits_output == 16) {
+ scope_max = 5;
+ scope_min = -5;
+ } else {
+ scope_max = 8;
+ scope_min = -8;
}
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, bits_input);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Helper to initialize a block of device data (scale_tensors)
- template <typename Element, typename Layout>
- bool initialize_scale_tensor(
- cutlass::TensorView<Element, Layout> view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template <typename Element, typename Layout>
+bool initialize_scale_tensor(
+ cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
+ double scope_max, scope_min;
- scope_min = -1;
- scope_max = 1;
+ scope_min = -1;
+ scope_max = 1;
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
- }
-
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
@@ -438,14 +438,18 @@ void initialize(const Options &options) {
if (IsDFp8 && options.save_amax) {
abs_max_D.resize(cutlass::make_Coord(1));
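+ // Zero-initialize the amax scalars so the kernel and the host reference both start from a known value.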
+ initialize_tensor(abs_max_D.host_view(), cutlass::Distribution::AllZeros, 0);
abs_max_D.sync_device();
reference_abs_max_D.resize(cutlass::make_Coord(1));
+ initialize_tensor(reference_abs_max_D.host_view(), cutlass::Distribution::AllZeros, 0);
}
if (IsAuxFp8 && options.save_aux && options.save_amax) {
abs_max_aux.resize(cutlass::make_Coord(1));
+ initialize_tensor(abs_max_aux.host_view(), cutlass::Distribution::AllZeros, 0);
abs_max_aux.sync_device();
reference_abs_max_aux.resize(cutlass::make_Coord(1));
+ initialize_tensor(reference_abs_max_aux.host_view(), cutlass::Distribution::AllZeros, 0);
}
}
@@ -517,10 +521,9 @@ bool verify(const Options &options) {
// Block scaling tensors shapes based CTA Block (TileShape) and GEMM Problem shape
auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
- auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{})));
- auto blockscale_m = cute::get<0>(blockscale_shape);
- auto blockscale_n = cute::get<1>(blockscale_shape);
- auto blockscale_k = cute::get<2>(blockscale_shape);
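+ // One scale factor per CTA tile along each mode; round up so partial tiles are covered.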
+ auto blockscale_m = ceil_div(options.m, get<0>(TileShape{}));
+ auto blockscale_n = ceil_div(options.n, get<1>(TileShape{}));
+ auto blockscale_k = ceil_div(options.k, get<2>(TileShape{}));
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
@@ -608,29 +611,40 @@ bool verify(const Options &options) {
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
+ bool passed = true;
tensor_D.sync_host();
- bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
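+ // Compare against the reference with a relative tolerance and report error statistics instead of requiring bit-exact equality.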
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_D.host_view(), tensor_ref_D.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ double mse = cutlass::reference::host::TensorMSE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double mre = cutlass::reference::host::TensorMRE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double max_error = cutlass::reference::host::TensorGreatestError(tensor_D.host_view(), tensor_ref_D.host_view());
+ std::cout << " Result MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
- if (false) {
- std::cout << "tensor_ref_D.host_view() {" << std::endl
- << tensor_ref_D.host_view() << std::endl
- << "}" << std::endl;
- std::cout << "tensor_D.host_view() {" << std::endl
- << tensor_D.host_view() << std::endl
- << "}" << std::endl;
- }
+#if 0
+ std::cout << "tensor_ref_D.host_view() {" << std::endl
+ << tensor_ref_D.host_view() << std::endl
+ << "}" << std::endl;
+ std::cout << "tensor_D.host_view() {" << std::endl
+ << tensor_D.host_view() << std::endl
+ << "}" << std::endl;
+#endif
if (IsDFp8 && options.save_amax) {
abs_max_D.sync_host();
- passed &= abs_max_D.at(cutlass::make_Coord(0)) == reference_abs_max_D.at(cutlass::make_Coord(0));
+ std::cout << " Abs max D: " << abs_max_D.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_D.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_D.at(cutlass::make_Coord(0)), reference_abs_max_D.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
if (options.save_aux) {
tensor_aux.sync_host();
- passed &= cutlass::reference::host::TensorEquals(tensor_ref_aux.host_view(), tensor_aux.host_view());
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_aux.host_view(), tensor_ref_aux.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ mse = cutlass::reference::host::TensorMSE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ mre = cutlass::reference::host::TensorMRE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ max_error = cutlass::reference::host::TensorGreatestError(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ std::cout << " Aux MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
if (IsAuxFp8 && options.save_amax) {
abs_max_aux.sync_host();
- passed &= abs_max_aux.at(cutlass::make_Coord(0)) == reference_abs_max_aux.at(cutlass::make_Coord(0));
+ std::cout << " Abs max aux: " << abs_max_aux.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_aux.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_aux.at(cutlass::make_Coord(0)), reference_abs_max_aux.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
}
@@ -671,10 +685,9 @@ int run(Options &options)
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
}
-
- // if (!result.passed) {
- // exit(-1);
- // }
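+ // When verification is skipped, report the run as passed so profiling-only runs still return success.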
+ else {
+ result.passed = true;
+ }
// Run profiling loop
if (options.iterations > 0)
@@ -707,7 +720,7 @@ int run(Options &options)
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
- return 0;
+ return result.passed;
}
#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
@@ -753,7 +766,9 @@ int main(int argc, char const **args) {
//
#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
- run(options);
+ bool passed = run(options);
+ if (!passed)
+ return -1;
#endif
return 0;
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
index 03945764..b7cdb00a 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
@@ -100,7 +100,7 @@ using LayoutB = cutlass::layout::ColumnMajor; // L
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
-using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands
+using ElementC = float; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
@@ -303,93 +303,93 @@ struct Result
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
- template <typename Element, typename Layout>
- bool initialize_tensor(
- cutlass::TensorView<Element, Layout> view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template <typename Element, typename Layout>
+bool initialize_tensor(
+ cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
- int bits_input = cutlass::sizeof_bits<Element>::value;
- int bits_output = cutlass::sizeof_bits<Element>::value;
+ double scope_max, scope_min;
+ int bits_input = cutlass::sizeof_bits<Element>::value;
+ int bits_output = cutlass::sizeof_bits<Element>::value;
- if (bits_input == 1) {
- scope_max = 2;
- scope_min = 0;
- } else if (bits_input <= 8) {
- scope_max = 2;
- scope_min = -2;
- } else if (bits_output == 16) {
- scope_max = 5;
- scope_min = -5;
- } else {
- scope_max = 8;
- scope_min = -8;
- }
-
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
+ if (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ } else if (bits_input <= 8) {
+ scope_max = 2;
+ scope_min = -2;
+ } else if (bits_output == 16) {
+ scope_max = 5;
+ scope_min = -5;
+ } else {
+ scope_max = 8;
+ scope_min = -8;
}
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, bits_input);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Helper to initialize a block of device data (scale_tensors)
- template
- bool initialize_scale_tensor(
- cutlass::TensorView view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template
+bool initialize_scale_tensor(
+ cutlass::TensorView view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
+ double scope_max, scope_min;
- scope_min = -1;
- scope_max = 1;
+ scope_min = -1;
+ scope_max = 1;
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
- }
-
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Initialize operands to be used in the GEMM and reference GEMM
template
@@ -403,11 +403,9 @@ void initialize(const Options &options) {
assert(options.n % ScaleGranularityN == 0);
// Find Group Scaling tensor shapes based on `ScaleGranularityM`, problem shape, and TileShape
- auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
- auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{})));
- auto groupscale_m = cute::get<0>(gemm_problem_shape) / ScaleGranularityM;
- auto groupscale_n = cute::get<1>(gemm_problem_shape) / ScaleGranularityN;
- auto blockscale_k = cute::get<2>(blockscale_shape);
+ auto groupscale_m = ceil_div(options.m, ScaleGranularityM);
+ auto groupscale_n = ceil_div(options.n, ScaleGranularityN);
+ auto blockscale_k = ceil_div(options.k, cute::get<2>(TileShape{}));
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
@@ -582,13 +580,11 @@ bool verify(const Options &options, const int ScaleMsPerTile
const int ScaleGranularityN = get<1>(TileShape_{}) / ScaleNsPerTile;
// Group scaling tensors shapes based `ScaleGranularityM`, CTA Block (TileShape) and GEMM Problem shape
- auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
- auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape_{})));
- auto blockscale_m = cute::get<0>(blockscale_shape);
- auto blockscale_n = cute::get<1>(blockscale_shape);
- auto blockscale_k = cute::get<2>(blockscale_shape);
- auto groupscale_m = get<0>(gemm_problem_shape) / ScaleGranularityM;
- auto groupscale_n = get<1>(gemm_problem_shape) / ScaleGranularityN;
+ auto blockscale_m = ceil_div(options.m, get<0>(TileShape_{}));
+ auto blockscale_n = ceil_div(options.n, get<1>(TileShape_{}));
+ auto blockscale_k = ceil_div(options.k, get<2>(TileShape_{}));
+ auto groupscale_m = ceil_div(options.m, ScaleGranularityM);
+ auto groupscale_n = ceil_div(options.n, ScaleGranularityN);
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
@@ -676,8 +672,13 @@ bool verify(const Options &options, const int ScaleMsPerTile
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
+ bool passed = true;
tensor_D.sync_host();
- bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_D.host_view(), tensor_ref_D.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ double mse = cutlass::reference::host::TensorMSE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double mre = cutlass::reference::host::TensorMRE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double max_error = cutlass::reference::host::TensorGreatestError(tensor_D.host_view(), tensor_ref_D.host_view());
+ std::cout << " Result MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
#if 0
std::cout << "tensor_ref_D.host_view() {" << std::endl
@@ -690,15 +691,21 @@ bool verify(const Options &options, const int ScaleMsPerTile
if (IsDFp8 && options.save_amax) {
abs_max_D.sync_host();
- passed &= abs_max_D.at(cutlass::make_Coord(0)) == reference_abs_max_D.at(cutlass::make_Coord(0));
+ std::cout << " Abs max D: " << abs_max_D.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_D.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_D.at(cutlass::make_Coord(0)), reference_abs_max_D.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
if (options.save_aux) {
tensor_aux.sync_host();
- passed &= cutlass::reference::host::TensorEquals(tensor_ref_aux.host_view(), tensor_aux.host_view());
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_aux.host_view(), tensor_ref_aux.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ mse = cutlass::reference::host::TensorMSE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ mre = cutlass::reference::host::TensorMRE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ max_error = cutlass::reference::host::TensorGreatestError(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ std::cout << " Aux MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
if (IsAuxFp8 && options.save_amax) {
abs_max_aux.sync_host();
- passed &= abs_max_aux.at(cutlass::make_Coord(0)) == reference_abs_max_aux.at(cutlass::make_Coord(0));
+ std::cout << " Abs max aux: " << abs_max_aux.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_aux.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_aux.at(cutlass::make_Coord(0)), reference_abs_max_aux.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
}
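The verification changes in this file move from exact tensor equality to a relative comparison driven by the new --epsilon and --non-zero-floor options. The sketch below illustrates the general shape of such a check on single elements; it is a minimal illustration of the idea, not the cutlass::relatively_equal implementation, and it assumes values below the floor are compared with an absolute tolerance.

```cpp
#include <cmath>
#include <cstdio>

// Illustrative relative-equality check: values whose magnitudes stay below
// `non_zero_floor` are compared absolutely against `epsilon`; larger values
// are compared relative to the larger magnitude. This mirrors the intent of
// the epsilon/non-zero-floor options, not the exact CUTLASS formula.
bool roughly_equal(float a, float b, float epsilon, float non_zero_floor) {
  float diff = std::fabs(a - b);
  float magnitude = std::fmax(std::fabs(a), std::fabs(b));
  if (magnitude < non_zero_floor) {
    return diff <= epsilon;            // near-zero values: absolute tolerance
  }
  return diff <= epsilon * magnitude;  // otherwise: relative tolerance
}

int main() {
  printf("%d\n", roughly_equal(100.0f, 101.5f, 0.02f, 1.0f));  // 1: within 2%
  printf("%d\n", roughly_equal(0.001f, 0.004f, 0.02f, 1.0f));  // 1: below floor
  printf("%d\n", roughly_equal(100.0f, 110.0f, 0.02f, 1.0f));  // 0: 10% off
  return 0;
}
```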
@@ -716,29 +723,29 @@ int run(Options &options)
const int ScaleNsPerTile = GroupScaleConfig::ScaleNsPerTile;
bool skip = false;
-
- if (options.m % ScaleGranularityM != 0) {
- std::cout << "Skippig (m size: " << options.m << " less then ScaleGranularityM: " << ScaleGranularityM << "):" << std::endl;
- skip = true;
- }
-
- if (options.n % ScaleGranularityN != 0) {
- std::cout << "Skippig (n size: " << options.m << " less then ScaleGranularityN: " << ScaleGranularityM << "):" << std::endl;
- skip = true;
- }
-
- if (options.k % size<2>(TileShape{}) != 0) {
- std::cout << "Skippig (k size: " << options.k << " less then TileShape[2]: " << size<2>(TileShape{}) << "):" << std::endl;
- skip = true;
- }
-
- if (!skip) std::cout << "Running: " << std::endl;
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Tile shape (M, N, K): " << size<0>(TileShape{}) << ", " << size<1>(TileShape{}) << ", " << size<2>(TileShape{}) << std::endl;
std::cout << " ScaleGranularityM: " << ScaleGranularityM << " (ScaleMsPerTile: " << ScaleMsPerTile << ")" << std::endl;
std::cout << " ScaleGranularityN: " << ScaleGranularityN << " (ScaleNsPerTile: " << ScaleNsPerTile << ")" << std::endl;
- if (skip) return -1;
+
+ if (options.m < ScaleGranularityM) {
+ std::cout << " Skippig (m size: " << options.m << " less than ScaleGranularityM: " << ScaleGranularityM << "):" << std::endl;
+ skip = true;
+ }
+
+ if (options.n < ScaleGranularityN) {
+ std::cout << " Skippig (n size: " << options.n << " less than ScaleGranularityN: " << ScaleGranularityN << "):" << std::endl;
+ skip = true;
+ }
+
+ if (options.k < size<2>(TileShape{})) {
+ std::cout << " Skippig (k size: " << options.k << " less than TileShape[2]: " << size<2>(TileShape{}) << "):" << std::endl;
+ skip = true;
+ }
+
+ if (!skip) std::cout << " Running... " << std::endl;
+ else return -1;
initialize(options);
@@ -770,17 +777,17 @@ int run(Options &options)
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
}
-
- if (!result.passed) {
- exit(-1);
+ else {
+ result.passed = true;
}
// Run profiling loop
if (options.iterations > 0)
{
GpuTimer timer;
- timer.start();
- for (int iter = 0; iter < options.iterations; ++iter) {
+ for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
+ if (iter == options.warmup)
+ timer.start();
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
CUTLASS_CHECK(gemm.run());
}
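The profiling loop above now folds warmup runs into the timed loop and starts the timer only once `iter` reaches `options.warmup`, so warmup launches no longer count toward the average runtime. A minimal standalone sketch of that pattern, with a placeholder workload in place of the GEMM launch, follows.

```cpp
#include <chrono>
#include <cstdio>

// Placeholder for the kernel launch being profiled.
void run_workload() { /* ... */ }

int main() {
  int warmup = 5, iterations = 100;
  std::chrono::steady_clock::time_point start;

  for (int iter = 0; iter < warmup + iterations; ++iter) {
    if (iter == warmup) {
      start = std::chrono::steady_clock::now();  // exclude warmup from timing
    }
    run_workload();
  }

  auto elapsed_ms = std::chrono::duration<double, std::milli>(
      std::chrono::steady_clock::now() - start).count();
  printf("Average runtime: %f ms\n", elapsed_ms / iterations);
  return 0;
}
```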
@@ -806,7 +813,7 @@ int run(Options &options)
fflush(stdout);
}
- return 0;
+ return result.passed;
}
#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
@@ -852,27 +859,31 @@ int main(int argc, char const **args) {
//
#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+ bool passed = true;
std::cout << "Basic split-K GEMM kernel" << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
std::cout << std::endl;
std::cout << "StreamK GEMM kernel" << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
+
+ if (!passed)
+ return -1;
#endif
return 0;
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
index 23f05ada..85aff756 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
@@ -46,6 +46,8 @@ struct Options {
int m = 1024, n = 512, k = 1024, l = 1;
RasterOrderOptions raster;
int swizzle;
+ float epsilon = 0.02f;
+ float non_zero_floor = 1.f;
// Parses the command line
void parse(int argc, char const **args) {
@@ -73,6 +75,8 @@ struct Options {
cmd.get_cmd_line_argument("warmup", warmup);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("verify", verify);
+ cmd.get_cmd_line_argument("epsilon", epsilon);
+ cmd.get_cmd_line_argument("non-zero-floor", non_zero_floor);
char raster_char;
cmd.get_cmd_line_argument("raster", raster_char);
@@ -113,7 +117,10 @@ struct Options {
<< " --save_amax= Save the pre-scaled max absolute value of any fp8 outputs (aux and/or D) (default: true)\n"
<< " --raster= CTA Rasterization direction (N for along N, M for along M, and H for heuristic)\n\n"
<< " --swizzle= CTA Rasterization swizzle\n\n"
- << " --iterations= Number of profiling iterations to perform.\n\n";
+ << " --iterations= Number of profiling iterations to perform.\n\n"
+ << " --verify= Verify the results.\n\n"
+ << " --epsilon= The epsilon value for comparing the results.\n\n"
+ << " --non-zero-floor= The none zero floor for comparing the results.\n\n";
out
<< "\n\nExamples:\n\n"
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h
index 6bb593bd..0bf90a41 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h
@@ -221,9 +221,9 @@ void gett_mainloop(
const int N = cute::size<0>(mainloop_params.B.layout());
const int ScaleGranularityM = M / cute::size<0>(mainloop_params.ScaleA);
const int ScaleGranularityN = N / cute::size<0>(mainloop_params.ScaleB);
- assert(ScaleGranularityM && M % ScaleGranularityM == 0
+ assert(ScaleGranularityM && M % ScaleGranularityM == 0
&& "ScaleGranularityM must divide M");
- assert(ScaleGranularityN && N % ScaleGranularityN == 0
+ assert(ScaleGranularityN && N % ScaleGranularityN == 0
&& "ScaleGranularityN must divide N");
cute::Tensor blockscale_A = domain_offset(
diff --git a/examples/69_hopper_mixed_dtype_grouped_gemm/README.md b/examples/69_hopper_mixed_dtype_grouped_gemm/README.md
index 272d36e5..f4d71ea3 100644
--- a/examples/69_hopper_mixed_dtype_grouped_gemm/README.md
+++ b/examples/69_hopper_mixed_dtype_grouped_gemm/README.md
@@ -12,3 +12,35 @@ Note that in Example 55, the argument `--g` is used to determine the block scale
## Upcoming features
Currently, the Mixed-input Grouped GEMM only supports row-wise scaling. Please contact us if zero-points or block-wise scaling are needed.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
index 3cee6caf..19d6b89d 100644
--- a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
+++ b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
@@ -194,12 +194,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle;
Options():
help(false),
m(8192), n(8192), k(8192),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -217,6 +219,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -231,6 +234,7 @@ struct Options {
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
<< " --beta= Epilogue scalar beta\n\n"
+ << " --swizzle= Cluster rasterization swizzle\n\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out
@@ -331,6 +335,8 @@ typename Gemm::Arguments args_from_options(const Options &options)
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
+
return arguments;
}
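Several examples in this patch gain a --swizzle option that is forwarded to the tile scheduler through `arguments.scheduler.max_swizzle_size`. The sketch below mocks up that plumbing with stand-in structs; the real `Gemm::Arguments` and `cutlass::CommandLine` types come from CUTLASS and are not reproduced here.

```cpp
#include <cstdio>

// Stand-ins for the nested Arguments struct that the examples populate; the
// real type is each example's Gemm::Arguments.
struct SchedulerArgs { int max_swizzle_size = 0; };
struct GemmArguments { SchedulerArgs scheduler; };

// Parse --swizzle=<int> from the command line, defaulting to 0 (no swizzle),
// and forward it to the scheduler arguments as the examples above do.
GemmArguments args_from_options(int argc, char** argv) {
  GemmArguments arguments;
  int swizzle = 0;
  for (int i = 1; i < argc; ++i) {
    if (sscanf(argv[i], "--swizzle=%d", &swizzle) == 1) { break; }
  }
  arguments.scheduler.max_swizzle_size = swizzle;
  return arguments;
}

int main(int argc, char** argv) {
  GemmArguments args = args_from_options(argc, argv);
  printf("max_swizzle_size = %d\n", args.scheduler.max_swizzle_size);
  return 0;
}
```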
diff --git a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
index 69a36310..d476ce00 100644
--- a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
+++ b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
@@ -231,6 +231,7 @@ struct Options {
bool save_amax = true;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1;
+ int swizzle = 0;
// Parses the command line
void parse(int argc, char const **args) {
@@ -256,6 +257,7 @@ struct Options {
cmd.get_cmd_line_argument("save_aux", save_aux, true);
cmd.get_cmd_line_argument("save_amax", save_amax, true);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -271,6 +273,7 @@ struct Options {
<< " --l= Sets the l extent (batch) of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
<< " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --scale_a= Scaling factor for A\n"
<< " --scale_b= Scaling factor for B\n"
<< " --scale_c= Scaling factor for C\n"
@@ -476,6 +479,8 @@ typename Gemm::Arguments args_from_options(const Options &options)
fusion_args.amax_D_ptr = abs_max_D.device_data();
}
+ arguments.scheduler.max_swizzle_size = options.swizzle;
+
return arguments;
}
diff --git a/examples/70_blackwell_gemm/CMakeLists.txt b/examples/70_blackwell_gemm/CMakeLists.txt
index cb401e3a..0ac1687d 100644
--- a/examples/70_blackwell_gemm/CMakeLists.txt
+++ b/examples/70_blackwell_gemm/CMakeLists.txt
@@ -28,14 +28,29 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-if (CUTLASS_NVCC_ARCHS MATCHES 100a)
+set(TEST_SWIZZLE_1 --swizzle=1)
+set(TEST_SWIZZLE_2 --swizzle=2)
+set(TEST_SWIZZLE_5 --swizzle=5)
+set(TEST_SWIZZLE_5_UNEVEN --swizzle=5 --m=4096 --n=16384)
+
+if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
cutlass_example_add_executable(
70_blackwell_fp16_gemm
70_blackwell_fp16_gemm.cu
-)
+ TEST_COMMAND_OPTIONS
+ TEST_SWIZZLE_1
+ TEST_SWIZZLE_2
+ TEST_SWIZZLE_5
+ TEST_SWIZZLE_5_UNEVEN
+)
cutlass_example_add_executable(
70_blackwell_fp8_gemm
70_blackwell_fp8_gemm.cu
+ TEST_COMMAND_OPTIONS
+ TEST_SWIZZLE_1
+ TEST_SWIZZLE_2
+ TEST_SWIZZLE_5
+ TEST_SWIZZLE_5_UNEVEN
)
endif()
diff --git a/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu b/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu
index 427af254..f911262f 100644
--- a/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu
+++ b/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu
@@ -74,12 +74,14 @@ struct Options {
int m, n, k, l;
float alpha, beta;
+ int swizzle;
Options():
help(false),
error(false),
m(2048), n(2048), k(2048), l(1),
- alpha(1.f), beta(0.f)
+ alpha(1.f), beta(0.f),
+ swizzle(0)
{ }
// Parses the command line
@@ -97,6 +99,7 @@ struct Options {
cmd.get_cmd_line_argument("l", l, 1);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -112,7 +115,8 @@ struct Options {
<< " --k= Sets the K extent of the GEMM\n"
<< " --l= Sets the L extent (batch count) of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n";
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n\n";
return out;
}
@@ -352,6 +356,8 @@ struct ExampleRunner {
hw_info
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
+
// See example 48 for details on custom EVT construction
if constexpr (UseCustomEVT) {
arguments.epilogue.thread =
diff --git a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
index f7e12fbf..f729b43d 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
@@ -211,12 +211,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle = 0;
Options():
help(false),
m(1024), n(1024), k(1024),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -234,6 +236,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -247,7 +250,8 @@ struct Options {
<< " --n= Sets the N extent of the GEMM\n"
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n"
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
@@ -333,7 +337,7 @@ bool initialize_block(
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
- using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
@@ -344,8 +348,8 @@ void initialize(const Options &options) {
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
- layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
- layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
block_B.reset(cutlass::make_Coord(size(layout_B)));
@@ -387,6 +391,7 @@ typename Gemm::Arguments args_from_options(const Options &options)
}
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
index 2719cab9..75d3437d 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
@@ -177,7 +177,7 @@ using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
using FusionOp = typename Gemm::EpilogueOutputOp;
constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
-using SfdOutputCfg = cutlass::detail::Sm100BlockScaledOutputConfig;
+using SfdOutputCfg = cutlass::detail::Sm1xxBlockScaledOutputConfig;
using LayoutSFD = typename SfdOutputCfg::LayoutSF;
//
@@ -240,12 +240,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle = 0;
Options():
help(false),
m(1024), n(1024), k(1024),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -263,6 +265,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -276,7 +279,8 @@ struct Options {
<< " --n= Sets the N extent of the GEMM\n"
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n"
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
@@ -362,9 +366,9 @@ bool initialize_block(
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
- using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
// For SFD tensor layout
- using Sm100BlockScaledOutputConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlockScaledOutputConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
@@ -375,8 +379,8 @@ void initialize(const Options &options) {
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
- layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
- layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
layout_SFD = SfdOutputCfg::tile_atom_to_shape_SFD(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
@@ -432,6 +436,7 @@ typename Gemm::Arguments args_from_options(const Options &options)
arguments.epilogue.thread.norm_constant_ptr = block_Normconst.device_data();
}
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
index 2784d050..1d6c1f3c 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
@@ -212,12 +212,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle = 0;
Options():
help(false),
m(1024), n(1024), k(1024),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -235,6 +237,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -248,7 +251,8 @@ struct Options {
<< " --n= Sets the N extent of the GEMM\n"
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n"
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
@@ -334,7 +338,7 @@ bool initialize_block(
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
- using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
@@ -345,8 +349,8 @@ void initialize(const Options &options) {
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
- layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
- layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
block_B.reset(cutlass::make_Coord(size(layout_B)));
@@ -388,6 +392,7 @@ typename Gemm::Arguments args_from_options(const Options &options)
}
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
index 19c4efd1..67b82a6e 100644
--- a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
+++ b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
@@ -214,7 +214,8 @@ struct Options {
int iterations;
int m, n, k;
int preferred_cluster_m, preferred_cluster_n, fallback_cluster_m, fallback_cluster_n;
-
+ int swizzle = 0;
+
Options():
help(false),
m(4096), n(4096), k(4096),
@@ -223,7 +224,8 @@ struct Options {
preferred_cluster_m(4),
preferred_cluster_n(4),
fallback_cluster_m(2),
- fallback_cluster_n(1)
+ fallback_cluster_n(1),
+ swizzle(0)
{ }
// Parses the command line
@@ -245,6 +247,7 @@ struct Options {
cmd.get_cmd_line_argument("preferred_cluster_n", preferred_cluster_n, 4);
cmd.get_cmd_line_argument("fallback_cluster_m", fallback_cluster_m, 2);
cmd.get_cmd_line_argument("fallback_cluster_n", fallback_cluster_n, 1);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
if (!validate_cluster_shape()){
std::cout << "--Invalid cluster shapes" << std::endl;
@@ -265,6 +268,7 @@ struct Options {
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
<< " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --preferred_cluster_m= Sets the M extent of preferred cluster shape\n"
<< " --preferred_cluster_n= Sets the N extent of preferred cluster shape\n"
<< " --fallback_cluster_m= Sets the M extent of fallback cluster shape\n"
@@ -384,7 +388,8 @@ typename Gemm::Arguments args_from_options(const Options &options) {
arguments.hw_info.cluster_shape = dim3(options.preferred_cluster_m, options.preferred_cluster_n,1);
arguments.hw_info.cluster_shape_fallback = dim3(options.fallback_cluster_m, options.fallback_cluster_n,1);
-
+
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
index 1d8db6e2..ad563a4b 100644
--- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
+++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
@@ -242,6 +242,7 @@ using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTil
struct Options {
bool help = false;
+ bool use_pdl = false;
float alpha = FLT_MAX;
float beta = FLT_MAX;
@@ -264,6 +265,9 @@ struct Options {
help = true;
return;
}
+ if (cmd.check_cmd_line_flag("use_pdl")) {
+ use_pdl = true;
+ }
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
@@ -387,7 +391,8 @@ struct Options {
<< " --raster= CTA Rasterization direction (N for along N, M for along M)\n\n"
<< " --iterations= Number of profiling iterations to perform\n\n"
<< " --benchmark= Executes a benchmark problem size\n"
- << " --max_sm_count= Run kernels using only these number of SMs\n";
+ << " --max_sm_count= Run kernels using only these number of SMs\n"
+ << " --use_pdl Launch kernel with PDL (Programmatic Dependent Launch) enabled\n";
out
<< "\n\nExamples:\n\n"
@@ -711,7 +716,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
// Check if output from CUTLASS kernel and reference kernel are equal or not
Result result;
@@ -730,7 +735,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
}
timer.stop();
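The grouped GEMM examples add a --use_pdl flag and pass it as the third argument of gemm.run(), enabling Programmatic Dependent Launch so a dependent kernel may begin its preamble before the prior kernel fully completes. The mock below only illustrates how the flag reaches the launch call; the real adapter and its run() parameters are the ones shown in the hunks above.

```cpp
#include <cstdio>
#include <cstring>

// Mock GEMM adapter illustrating the run() call pattern used above: the third
// parameter toggles Programmatic Dependent Launch (PDL). This is a stand-in,
// not the CUTLASS GemmUniversalAdapter.
struct MockGemm {
  int run(void* stream = nullptr, void* cuda_adapter = nullptr,
          bool launch_with_pdl = false) {
    printf("launching %s PDL\n", launch_with_pdl ? "with" : "without");
    return 0;
  }
};

int main(int argc, char** argv) {
  bool use_pdl = false;
  for (int i = 1; i < argc; ++i) {
    if (std::strcmp(argv[i], "--use_pdl") == 0) { use_pdl = true; }
  }
  MockGemm gemm;
  // Mirrors: gemm.run(/* stream */ nullptr, /* cuda_adapter */ nullptr,
  //                   /* launch_with_pdl */ options.use_pdl);
  gemm.run(nullptr, nullptr, use_pdl);
  return 0;
}
```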
diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
index ee697135..d5814c0a 100644
--- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
+++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
@@ -219,14 +219,14 @@ using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
-using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
-using Sm100BlockScaledOutputConfig = cutlass::detail::Sm100BlockScaledOutputConfig<
+using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+using Sm1xxBlockScaledOutputConfig = cutlass::detail::Sm1xxBlockScaledOutputConfig<
OutputSFVectorSize,
cute::is_same_v ? cute::UMMA::Major::K : cute::UMMA::Major::MN
>;
-using OutputSFAtom = typename Sm100BlockScaledOutputConfig::SfAtom;
-using LayoutSFD = typename Sm100BlockScaledOutputConfig::LayoutSF;
+using OutputSFAtom = typename Sm1xxBlockScaledOutputConfig::SfAtom;
+using LayoutSFD = typename Sm1xxBlockScaledOutputConfig::LayoutSF;
// Host-side allocations
std::vector stride_A_host;
@@ -305,6 +305,7 @@ struct Options {
bool help = false;
bool verification = true;
+ bool use_pdl = false;
float alpha = FLT_MAX;
float beta = FLT_MAX;
@@ -328,9 +329,12 @@ struct Options {
help = true;
return;
}
- if (cmd.check_cmd_line_flag("no-verif")) {
+ if (cmd.check_cmd_line_flag("no_verif")) {
verification = false;
}
+ if (cmd.check_cmd_line_flag("use_pdl")) {
+ use_pdl = true;
+ }
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
@@ -457,7 +461,8 @@ struct Options {
<< " --iterations= Number of profiling iterations to perform\n\n"
<< " --benchmark= Executes a benchmark problem size\n"
<< " --max_sm_count= Run kernels using only these number of SMs\n"
- << " --no-verif Do not run (host-side) verification kernels\n";
+ << " --no_verif Do not run (host-side) verification kernels\n"
+ << " --use_pdl Launch kernel with PDL (Programmatic Dependent Launch) enabled\n";
out
<< "\n\nExamples:\n\n"
@@ -554,9 +559,9 @@ void allocate(const Options &options) {
auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
auto layout_D = make_layout(make_shape(M, N, 1), stride_D);
- auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
- auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
- auto layout_SFD = Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
+ auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
+ auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
+ auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
stride_A_host.push_back(stride_A);
stride_B_host.push_back(stride_B);
@@ -775,9 +780,9 @@ bool verify(const Options &options) {
auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
auto layout_D = make_layout(make_shape(M, N, 1), stride_D);
- auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
- auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
- auto layout_SFD = Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
+ auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
+ auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
+ auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
// Create the arguments for host reference implementation
Tensor tensor_A = make_tensor(make_iterator(block_A.at(i).host_data()), layout_A);
@@ -845,7 +850,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
cudaDeviceSynchronize();
@@ -870,7 +875,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
}
timer.stop();
diff --git a/examples/77_blackwell_fmha/README.md b/examples/77_blackwell_fmha/README.md
index 8766f081..2f4c9c76 100644
--- a/examples/77_blackwell_fmha/README.md
+++ b/examples/77_blackwell_fmha/README.md
@@ -21,3 +21,35 @@ To modify the code for fusions, `collective/fmha_fusion.hpp` provides the easies
The `apply_mask` function is called with the accumulator of the first GEMM and the logical positions of those elements.
It is well-suited for applying masks or activations.
More complex fusions that require memory loads would require modifying the mainloop collective to orchestrate the load via TMA.
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
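The 77_blackwell_fmha README above identifies `apply_mask` in `collective/fmha_fusion.hpp` as the hook that receives the first GEMM's accumulator together with the logical positions of its elements. The snippet below is a purely hypothetical, self-contained illustration of that idea, applying a causal mask by position; it does not reproduce the actual fusion interface or its fragment types.

```cpp
#include <cstdio>
#include <limits>

// Hypothetical stand-in for a fusion hook: given an accumulator value and the
// logical (row, col) position it corresponds to, apply a causal mask by
// forcing future positions to -inf ahead of the softmax.
float apply_mask(float acc, int row, int col) {
  return (col <= row) ? acc : -std::numeric_limits<float>::infinity();
}

int main() {
  // Apply the mask to a small 4x4 tile of attention scores.
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      printf("%6.1f ", apply_mask(1.0f, r, c));
    }
    printf("\n");
  }
  return 0;
}
```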
diff --git a/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
new file mode 100644
index 00000000..058c4b2b
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
@@ -0,0 +1,546 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM120 architecture.
+
+ This example demonstrates a simple way to instantiate and run a blockscaled NVFP4 GEMM on the NVIDIA Blackwell SM120 architecture.
+ This kernel is optimized for the GeForce RTX 50 series GPUs.
+
+ The Blackwell SM120 CUTLASS kernel uses the new Block Scaled Tensor Core MMA Instructions (mma.sync.aligned.block_scale).
+ NVFP4 MMA has 2x throughput compared to MXFP8 MMA and 4x throughput compared to Ada Tensor Core FP8 MMA.
+ (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+
+ This kernel leverages:
+ 1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
+ 2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+ 3. Block Scaled Tensor Core MMA Instructions
+ 4. Epilogue Optimization
+
+ Note that GeForce RTX 50 series GPUs do not support:
+ 1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
+ 2. Dynamic datatypes.
+
+ Usage:
+
+ $ ./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+
+#include
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = cutlass::nv_float4_t; // Element type for A matrix operand
+using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 32; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = cutlass::nv_float4_t; // Element type for B matrix operand
+using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 32; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
+
+// C/D matrix configuration
+using ElementD = cutlass::bfloat16_t; // Element type for D matrix operand
+using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
+using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
+using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
+constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+// Kernel functional config
+using ElementAccumulator = float; // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
+
+// Kernel Perf config
+using ThreadBlockShape = Shape<_128,_128,_128>; // Threadblock's tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ThreadBlockShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto,
+ ElementAccumulator, ElementAccumulator,
+ ElementC, LayoutCTag, AlignmentC,
+ ElementD, LayoutDTag, AlignmentD,
+ cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
+ >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutATag, AlignmentA,
+ ElementB, LayoutBTag, AlignmentB,
+ ElementAccumulator,
+ ThreadBlockShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto defaults to cooperative kernel schedule
+ >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue,
+ void>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter;
+
+// Reference device GEMM implementation type
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
+using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
+using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
+using StrideD = typename Gemm::GemmKernel::StrideD;
+using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+LayoutA layout_A;
+LayoutSFA layout_SFA;
+StrideB stride_B;
+LayoutB layout_B;
+LayoutSFB layout_SFB;
+StrideC stride_C;
+LayoutC layout_C;
+StrideD stride_D;
+LayoutD layout_D;
+uint64_t seed;
+
+// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
+// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
+cutlass::HostTensor block_A;
+cutlass::HostTensor block_SFA;
+cutlass::HostTensor block_B;
+cutlass::HostTensor block_SFB;
+cutlass::HostTensor block_C;
+// Output Tensor
+cutlass::HostTensor block_D;
+// Reference Output Tensor
+cutlass::HostTensor block_reference_D;
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+template
+auto make_iterator(T* ptr) {
+ using namespace cute;
+ if constexpr (cute::is_subbyte_v) {
+ return subbyte_iterator(ptr);
+ }
+ else {
+ return ptr;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(1024), n(1024), k(1024),
+ alpha(1.f), beta(0.f),
+ iterations(10)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "79a_blackwell_geforce_nvfp4_bf16_gemm\n\n"
+ << " Blackwell NVFP4 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Number of profiling iterations to perform.\n\n";
+
+ out << "\n\nExamples:\n\n"
+ << "$ " << "./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template
+bool initialize_block(
+ cutlass::TensorView view,
+ uint64_t seed) {
+
+ double scope_max, scope_min;
+ constexpr int bits_input = cutlass::sizeof_bits::value;
+
+ if constexpr (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ }
+ else if constexpr (bits_input <= 6) {
+ scope_max = 2;
+ scope_min = -2;
+ }
+ else if constexpr (bits_input <= 8) {
+ if constexpr (cute::is_same_v) {
+ scope_max = 4;
+ scope_min = 1;
+ }
+ else {
+ scope_max = 1;
+ scope_min = -1;
+ }
+ }
+ else{
+ scope_max = 4;
+ scope_min = -4;
+ }
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+ using namespace cute;
+ // For SFA and SFB tensors layouts
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
+
+ layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
+ layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
+ layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
+ layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+
+ block_A.reset(cutlass::make_Coord(size(layout_A)));
+ block_B.reset(cutlass::make_Coord(size(layout_B)));
+ block_C.reset(cutlass::make_Coord(size(layout_C)));
+ block_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
+ block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
+
+ initialize_block(block_A.host_view(), seed + 2021);
+ initialize_block(block_B.host_view(), seed + 2022);
+ initialize_block(block_C.host_view(), seed + 2023);
+ initialize_block(block_SFA.host_view(), seed + 2024);
+ initialize_block(block_SFB.host_view(), seed + 2025);
+
+ block_A.sync_device();
+ block_B.sync_device();
+ block_C.sync_device();
+ block_SFA.sync_device();
+ block_SFB.sync_device();
+}
+
+// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments {
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k, 1},
+ { // Mainloop arguments
+ block_A.device_data(), stride_A,
+ block_B.device_data(), stride_B,
+ block_SFA.device_data(), layout_SFA,
+ block_SFB.device_data(), layout_SFB
+ },
+ { // Epilogue arguments
+ {options.alpha, options.beta},
+ block_C.device_data(), stride_C,
+ block_D.device_data(), stride_D
+ }
+ };
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ using namespace cute;
+ // Create the arguments for host reference implementation
+ Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
+ Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
+ Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
+ Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
+
+ cutlass::reference::host::GettBlockScalingMainloopParams<
+ ElementAccumulator, // ElementAccumulator
+ decltype(tensor_A), // TensorA
+ decltype(tensor_SFA), // TensorSfA
+ decltype(tensor_B), // TensorB
+ decltype(tensor_SFB) // TensorSfB
+ > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
+
+ auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
+ auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
+
+ cutlass::reference::host::GettBlockScalingEpilogueParams<
+ ElementAccumulator, // ElementScalar
+ ElementAccumulator, // ElementAccumulator
+ ElementAccumulator, // ElementCompute
+ decltype(tensor_C), // TensorC
+ decltype(tensor_D) // TensorD
+ > epilogue_params{options.alpha, options.beta, tensor_C, tensor_D};
+
+ cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
+
+ // Comparison
+ block_D.sync_host();
+ bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
+ passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
+ passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
+
+ return passed;
+}
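+// Note: verification uses an exact tensor-equality check, which is reasonable here because the host
+// reference is expected to perform the same blockscaled arithmetic as the device kernel; the two
+// norm checks guard against a trivially all-zero output being counted as a pass.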
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ cudaDeviceSynchronize();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with the CUDA 12.8 Toolkit or newer to run this example,
+ // and the device must have compute capability 120 (Blackwell SM120).
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
+ std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+
+ if (!(props.major == 12 && props.minor == 0)) {
+ std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+ run<Gemm>(options);
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
new file mode 100644
index 00000000..e3ebba4a
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
@@ -0,0 +1,593 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM120 architecture.
+
+ This example demonstrates a simple way to instantiate and run a blockscaled NVFP4 GEMM on the NVIDIA Blackwell SM120 architecture.
+ The kernel outputs quantized fp4 values with scale factors that will be the input of another GEMM.
+ This kernel is optimized for the GeForce RTX 50 series GPUs.
+
+ Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
+
+ 1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
+ 2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+ 3. Block Scaled Tensor Core MMA Instructions
+ 4. Epilogue Optimization
+
+ Note that GeForce RTX 50 series GPUs do not support:
+ 1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
+ 2. Dynamic datatypes.
+
+ Usage:
+
+ $ ./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+
+#include
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>; // Element type for A matrix operand
+using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 32; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>; // Element type for B matrix operand
+using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 32; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
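+// With 4-bit NVFP4 operands, an alignment of 32 elements corresponds to 32 x 4 bits = 16 bytes,
+// i.e. the 16-byte access granularity mentioned above.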
+
+// C/D matrix configuration
+using ElementD = cutlass::float_e2m1_t; // Element type for D matrix operand
+using ElementSFD = cutlass::float_ue8m0_t; // Element type for SFD matrix operand
+using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
+using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
+using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
+using LayoutSFDTag = LayoutDTag; // Layout type for SFD should be same as D matrix operand
+
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
+constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+// Kernel functional config
+using ElementAccumulator = float; // Element type for internal accumulation
+using ElementCompute = float; // Element type for epilogue computation
+using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
+
+// Kernel Perf config
+using ThreadBlockShape = Shape<_128,_128,_128>; // Threadblock's tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster
+
+constexpr int InputSFVectorSize = 16;
+constexpr int OutputSFVectorSize = InputSFVectorSize;
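+// One output scale factor is generated per OutputSFVectorSize (16) elements of D, matching the
+// 16-element scale-factor block size of the NVFP4 inputs.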
+
+// D = alpha * acc + beta * C
+// With BlockScaleFactor generation.
+using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
+ OutputSFVectorSize,
+ ElementD,
+ ElementCompute,
+ ElementSFD, LayoutSFDTag,
+ ElementC>;
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ThreadBlockShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto,
+ ElementAccumulator, ElementAccumulator,
+ ElementC, LayoutCTag, AlignmentC,
+ ElementD, LayoutDTag, AlignmentD,
+ cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
+ FusionOperation
+ >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutATag, AlignmentA,
+ ElementB, LayoutBTag, AlignmentB,
+ ElementAccumulator,
+ ThreadBlockShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ cutlass::gemm::KernelTmaWarpSpecializedPingpong // Ping-pong kernel schedule policy.
+ >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int,int,int,int>, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue,
+ void>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+// Reference device GEMM implementation type
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
+using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
+using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
+using StrideD = typename Gemm::GemmKernel::StrideD;
+using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
+
+using FusionOp = typename Gemm::EpilogueOutputOp;
+constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
+using SfdOutputCfg = cutlass::detail::Sm1xxBlockScaledOutputConfig<OutputSFVectorSize>;
+using LayoutSFD = typename SfdOutputCfg::LayoutSF;
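+// Like SFA and SFB, the generated SFD tensor uses the interleaved blockscaled atom layout provided
+// by the output configuration rather than a plain packed stride.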
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+LayoutA layout_A;
+LayoutSFA layout_SFA;
+StrideB stride_B;
+LayoutB layout_B;
+LayoutSFB layout_SFB;
+StrideC stride_C;
+LayoutC layout_C;
+StrideD stride_D;
+LayoutD layout_D;
+LayoutSFD layout_SFD;
+
+uint64_t seed;
+
+// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
+// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
+cutlass::HostTensor block_A;
+cutlass::HostTensor block_SFA;
+cutlass::HostTensor block_B;
+cutlass::HostTensor block_SFB;
+cutlass::HostTensor block_C;
+// Output Tensor
+cutlass::HostTensor block_D;
+cutlass::HostTensor block_SFD;
+
+// Reference Output Tensor
+cutlass::HostTensor block_reference_D;
+cutlass::HostTensor block_reference_SFD;
+// Matrix-wide normalization constant
+cutlass::HostTensor block_Normconst;
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+template <typename T>
+auto make_iterator(T* ptr) {
+ using namespace cute;
+ if constexpr (cute::is_subbyte_v<T>) {
+ return subbyte_iterator<T>(ptr);
+ }
+ else {
+ return ptr;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(1024), n(1024), k(1024),
+ alpha(1.f), beta(0.f),
+ iterations(10)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "79b_blackwell_geforce_nvfp4_nvfp4_gemm\n\n"
+ << " Blackwell NVFP4 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Number of profiling iterations to perform.\n\n";
+
+ out << "\n\nExamples:\n\n"
+ << "$ " << "./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
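+// For the default 1024x1024x1024 problem, Options::gflops() computes 2 * 1024^3 ~= 2.15 GFLOP,
+// so an average runtime of 1 ms corresponds to roughly 2147 GFLOP/s in the report below.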
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template <typename Element, typename Layout>
+bool initialize_block(
+ cutlass::TensorView<Element, Layout> view,
+ uint64_t seed) {
+
+ double scope_max, scope_min;
+ constexpr int bits_input = cutlass::sizeof_bits<Element>::value;
+
+ if constexpr (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ }
+ else if constexpr (bits_input <= 6) {
+ scope_max = 2;
+ scope_min = -2;
+ }
+ else if constexpr (bits_input <= 8) {
+ if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
+ scope_max = 4;
+ scope_min = 1;
+ }
+ else {
+ scope_max = 1;
+ scope_min = -1;
+ }
+ }
+ else{
+ scope_max = 4;
+ scope_min = -4;
+ }
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+ using namespace cute;
+ // For SFA and SFB tensors layouts
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+ // For SFD tensor layout
+ using Sm1xxBlockScaledOutputConfig= typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
+
+ layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
+ layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
+ layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
+ layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFD = SfdOutputCfg::tile_atom_to_shape_SFD(cute::make_shape(options.m, options.n, options.k, 1));
+
+ block_A.reset(cutlass::make_Coord(size(layout_A)));
+ block_B.reset(cutlass::make_Coord(size(layout_B)));
+ block_C.reset(cutlass::make_Coord(size(layout_C)));
+ block_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_SFD.reset(cutlass::make_Coord(size(filter_zeros(layout_SFD))));
+ block_Normconst.reset(cutlass::make_Coord(1));
+
+ block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
+ block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
+ block_SFD.reset(cutlass::make_Coord(size(filter_zeros(layout_SFD))));
+
+ initialize_block(block_A.host_view(), seed + 2021);
+ initialize_block(block_B.host_view(), seed + 2022);
+ initialize_block(block_C.host_view(), seed + 2023);
+ initialize_block(block_SFA.host_view(), seed + 2024);
+ initialize_block(block_SFB.host_view(), seed + 2025);
+ block_Normconst.at(cutlass::make_Coord(0)) = 2;
+
+ block_A.sync_device();
+ block_B.sync_device();
+ block_C.sync_device();
+ block_SFA.sync_device();
+ block_SFB.sync_device();
+ block_SFD.sync_device();
+ block_Normconst.sync_device();
+}
+
+// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments {
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k, 1},
+ { // Mainloop arguments
+ block_A.device_data(), stride_A,
+ block_B.device_data(), stride_B,
+ block_SFA.device_data(), layout_SFA,
+ block_SFB.device_data(), layout_SFB
+ },
+ { // Epilogue arguments
+ {options.alpha, options.beta},
+ block_C.device_data(), stride_C,
+ block_D.device_data(), stride_D
+ }
+ };
+
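+ // If the epilogue fusion supports block scale factor generation (IsBlockScaleSupported), also wire up
+ // the device pointers for the generated SFD tensor and the matrix-wide normalization constant below.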
+ if constexpr (IsBlockScaleSupported) {
+ arguments.epilogue.thread.block_scale_factor_ptr = block_SFD.device_data();
+ arguments.epilogue.thread.norm_constant_ptr = block_Normconst.device_data();
+ }
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ using namespace cute;
+ // Create the arguments for host reference implementation
+ Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
+ Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
+ Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
+ Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
+
+ cutlass::reference::host::GettBlockScalingMainloopParams<
+ ElementAccumulator, // ElementAccumulator
+ decltype(tensor_A), // TensorA
+ decltype(tensor_SFA), // TensorSfA
+ decltype(tensor_B), // TensorB
+ decltype(tensor_SFB) // TensorSfB
+ > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
+
+ auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
+ auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
+ auto tensor_SFD = make_tensor(block_reference_SFD.host_data(), layout_SFD);
+
+ cutlass::reference::host::GettBlockScalingEpilogueParams<
+ ElementAccumulator, // ElementScalar
+ ElementAccumulator, // ElementAccumulator
+ ElementAccumulator, // ElementCompute
+ decltype(tensor_C), // TensorC
+ decltype(tensor_D), // TensorD
+ decltype(tensor_SFD), // TensorSfD
+ cute::Int<OutputSFVectorSize>,
+ cutlass::reference::host::SfStrategy::SfDGen
+ > epilogue_params{options.alpha, options.beta, tensor_C, tensor_D, tensor_SFD, block_Normconst.at(cutlass::make_Coord(0))};
+
+ cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
+
+ // Comparison
+ block_D.sync_host();
+ bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
+ passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
+ passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
+
+ return passed;
+}
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ cudaDeviceSynchronize();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with the CUDA 12.8 Toolkit or newer to run this example,
+ // and the device must have compute capability 120 (Blackwell SM120).
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
+ std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+
+ if (!(props.major == 12 && props.minor == 0)) {
+ std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+ run<Gemm>(options);
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
new file mode 100644
index 00000000..ac2f39c9
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
@@ -0,0 +1,546 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM120 architecture.
+
+ This example demonstrates a simple way to instantiate and run a mixed precision blockscaled GEMM on the NVIDIA Blackwell SM120 architecture.
+ This kernel is optimized for the GeForce RTX 50 series GPUs.
+
+ The Blackwell SM120 CUTLASS kernel uses the new Block Scaled Tensor Core MMA Instructions (mma.sync.aligned.block_scale).
+ MXFP8 MMA has 2x throughput compared to Ada Tensor Core FP8 MMA.
+ (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+
+ Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
+ 1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
+ 2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+ 3. Block Scaled Tensor Core MMA Instructions
+ 4. Epilogue Optimization
+
+ Note that GeForce RTX 50 series GPUs do not support:
+ 1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
+ 2. Dynamic datatypes.
+
+ Usage:
+
+ $ ./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+
+#include
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = cutlass::mx_float8_t; // Element type for A matrix operand
+using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 16; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = cutlass::mx_float6_t; // Element type for B matrix operand
+using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 128; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
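+// Note: 16 MXFP8 (8-bit) elements span 16 bytes, while 128 FP6 (6-bit) elements span 96 bytes; the
+// packed sub-byte B operand requires the larger element-count alignment here.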
+
+// C/D matrix configuration
+using ElementD = cutlass::bfloat16_t; // Element type for D matrix operand
+using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
+using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
+using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
+constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+// Kernel functional config
+using ElementAccumulator = float; // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
+
+// Kernel Perf config
+using ThreadBlockShape = Shape<_128,_128,_128>; // Threadblock's tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster
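+// As noted above, SM120 GeForce GPUs do not support the multicast feature of TMA load, so the
+// cluster shape must remain 1x1x1.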
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ThreadBlockShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto,
+ ElementAccumulator, ElementAccumulator,
+ ElementC, LayoutCTag, AlignmentC,
+ ElementD, LayoutDTag, AlignmentD,
+ cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
+ >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutATag, AlignmentA,
+ ElementB, LayoutBTag, AlignmentB,
+ ElementAccumulator,
+ ThreadBlockShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto defaults to cooperative kernel schedule
+ >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int,int,int,int>, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue,
+ void>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+// Reference device GEMM implementation type
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
+using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
+using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
+using StrideD = typename Gemm::GemmKernel::StrideD;
+using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+LayoutA layout_A;
+LayoutSFA layout_SFA;
+StrideB stride_B;
+LayoutB layout_B;
+LayoutSFB layout_SFB;
+StrideC stride_C;
+LayoutC layout_C;
+StrideD stride_D;
+LayoutD layout_D;
+uint64_t seed;
+
+// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
+// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
+cutlass::HostTensor block_A;
+cutlass::HostTensor block_SFA;
+cutlass::HostTensor block_B;
+cutlass::HostTensor block_SFB;
+cutlass::HostTensor block_C;
+// Output Tensor
+cutlass::HostTensor block_D;
+// Reference Output Tensor
+cutlass::HostTensor block_reference_D;
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+template <typename T>
+auto make_iterator(T* ptr) {
+ using namespace cute;
+ if constexpr (cute::is_subbyte_v<T>) {
+ return subbyte_iterator<T>(ptr);
+ }
+ else {
+ return ptr;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(1024), n(1024), k(1024),
+ alpha(1.f), beta(0.f),
+ iterations(10)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "79c_blackwell_geforce_mixed_mxfp8_bf16_gemm\n\n"
+ << " Blackwell NVFP4 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Number of profiling iterations to perform.\n\n";
+
+ out << "\n\nExamples:\n\n"
+ << "$ " << "./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_bf16_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template <typename Element, typename Layout>
+bool initialize_block(
+ cutlass::TensorView<Element, Layout> view,
+ uint64_t seed) {
+
+ double scope_max, scope_min;
+ constexpr int bits_input = cutlass::sizeof_bits<Element>::value;
+
+ if constexpr (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ }
+ else if constexpr (bits_input <= 6) {
+ scope_max = 2;
+ scope_min = -2;
+ }
+ else if constexpr (bits_input <= 8) {
+ if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
+ scope_max = 4;
+ scope_min = 1;
+ }
+ else {
+ scope_max = 1;
+ scope_min = -1;
+ }
+ }
+ else{
+ scope_max = 4;
+ scope_min = -4;
+ }
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+ using namespace cute;
+ // For SFA and SFB tensors layouts
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
+
+ layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
+ layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
+ layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
+ layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+
+ block_A.reset(cutlass::make_Coord(size(layout_A)));
+ block_B.reset(cutlass::make_Coord(size(layout_B)));
+ block_C.reset(cutlass::make_Coord(size(layout_C)));
+ block_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
+ block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
+
+ initialize_block(block_A.host_view(), seed + 2021);
+ initialize_block(block_B.host_view(), seed + 2022);
+ initialize_block(block_C.host_view(), seed + 2023);
+ initialize_block(block_SFA.host_view(), seed + 2024);
+ initialize_block(block_SFB.host_view(), seed + 2025);
+
+ block_A.sync_device();
+ block_B.sync_device();
+ block_C.sync_device();
+ block_SFA.sync_device();
+ block_SFB.sync_device();
+}
+
+// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments {
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k, 1},
+ { // Mainloop arguments
+ block_A.device_data(), stride_A,
+ block_B.device_data(), stride_B,
+ block_SFA.device_data(), layout_SFA,
+ block_SFB.device_data(), layout_SFB
+ },
+ { // Epilogue arguments
+ {options.alpha, options.beta},
+ block_C.device_data(), stride_C,
+ block_D.device_data(), stride_D
+ }
+ };
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ using namespace cute;
+ // Create the arguments for host reference implementation
+ Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
+ Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
+ Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
+ Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
+
+ cutlass::reference::host::GettBlockScalingMainloopParams<
+ ElementAccumulator, // ElementAccumulator
+ decltype(tensor_A), // TensorA
+ decltype(tensor_SFA), // TensorSfA
+ decltype(tensor_B), // TensorB
+ decltype(tensor_SFB) // TensorSfB
+ > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
+
+ auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
+ auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
+
+ cutlass::reference::host::GettBlockScalingEpilogueParams<
+ ElementAccumulator, // ElementScalar
+ ElementAccumulator, // ElementAccumulator
+ ElementAccumulator, // ElementCompute
+ decltype(tensor_C), // TensorC
+ decltype(tensor_D) // TensorD
+ > epilogue_params{options.alpha, options.beta, tensor_C, tensor_D};
+
+ cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
+
+ // Comparison
+ block_D.sync_host();
+ bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
+ passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
+ passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
+
+ return passed;
+}
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ cudaDeviceSynchronize();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with the CUDA 12.8 Toolkit or newer to run this example,
+ // and the device must have compute capability 120 (Blackwell SM120).
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
+ std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+
+ if (!(props.major == 12 && props.minor == 0)) {
+ std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+ run<Gemm>(options);
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/79_blackwell_geforce_gemm/CMakeLists.txt b/examples/79_blackwell_geforce_gemm/CMakeLists.txt
new file mode 100644
index 00000000..cb7e3e97
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/CMakeLists.txt
@@ -0,0 +1,47 @@
+
+# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+if (CUTLASS_NVCC_ARCHS MATCHES 120a)
+cutlass_example_add_executable(
+ 79a_blackwell_geforce_nvfp4_bf16_gemm
+ 79a_blackwell_geforce_nvfp4_bf16_gemm.cu
+)
+
+cutlass_example_add_executable(
+ 79b_blackwell_geforce_nvfp4_nvfp4_gemm
+ 79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
+)
+
+cutlass_example_add_executable(
+ 79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm
+ 79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
+)
+
+endif()
diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
index 417830f2..3148d2aa 100644
--- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
+++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
@@ -216,7 +216,7 @@ struct Options {
out
<< "\n\nExamples:\n\n"
- << "$ " << "81_blackwell_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+ << "$ " << "112_blackwell_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a1a5c00a..0f03cd9b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -157,6 +157,7 @@ foreach(EXAMPLE
76_blackwell_conv
77_blackwell_fmha
78_blackwell_emulated_bf16x9_gemm
+ 79_blackwell_geforce_gemm
81_blackwell_gemm_blockwise
)
diff --git a/examples/README.md b/examples/README.md
index 68bf7077..92779c07 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -282,6 +282,10 @@
Blackwell SM100 FastFP32 (using BF16 to emulate SGEMM) kernel
+* [79_blackwell_geforce_gemm](79_blackwell_geforce_gemm/)
+
+ Blackwell SM120 MMA kernels targeting GeForce RTX 50 series GPUs
+
# CuTe - Programming Examples
Examples that do not rely on CUTLASS and directly showcase the features of CuTe are located in [cutlass/examples/cute](./cute/).
@@ -291,3 +295,35 @@ Additionally, CuTe's core layout and layout algebra have their own test cases wi
# Python Interface Examples
Examples leveraging CUTLASS's [Python interface](../python/README.md) are located in [cutlass/examples/python](python/).
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/common/gather_tensor.hpp b/examples/common/gather_tensor.hpp
index 67ae811b..46fb6400 100644
--- a/examples/common/gather_tensor.hpp
+++ b/examples/common/gather_tensor.hpp
@@ -58,7 +58,7 @@ struct IndexedGather
operator()(I i) const { return indices_[i]; }
CUTE_HOST_DEVICE friend
- void
+ void
print(IndexedGather const &s) {
cute::print("Indexed");
}
@@ -80,7 +80,7 @@ struct StridedGather
operator()(I i) const { return i * stride_; }
CUTE_HOST_DEVICE friend
- void
+ void
print(StridedGather const &s) {
cute::print("Strided{");
print(s.stride_);
@@ -153,7 +153,7 @@ make_custom_stride_layout(Stride const &stride, Func&& func)
/// Helper function to optionally create a gather tensor
template
CUTLASS_HOST_DEVICE
-auto
+auto
make_gather_tensor(Iterator iter, Shape const &shape, Stride const &stride, Func &&func)
{
if constexpr (not cutlass::platform::is_same, NoGather>::value) {
@@ -180,7 +180,7 @@ upcast(Shape const& shape, Stride const& stride)
return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast<N>(s,d); });
} else if constexpr (is_scaled_basis<Stride>::value) {
if constexpr (Stride::mode() == I) {
- return make_layout(shape_div(shape, Int<N>{}), shape_div(stride, Int<N>{}));
+ return make_layout(ceil_div(shape, Int<N>{}), ceil_div(stride, Int<N>{}));
} else {
return make_layout(shape, stride);
}
diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt
index f263e5ce..3c9e93c4 100644
--- a/examples/cute/tutorial/CMakeLists.txt
+++ b/examples/cute/tutorial/CMakeLists.txt
@@ -27,34 +27,31 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+add_subdirectory(hopper)
+add_subdirectory(blackwell)
cutlass_example_add_executable(
- sgemm_1
+ cute_tutorial_sgemm_1
sgemm_1.cu
)
cutlass_example_add_executable(
- sgemm_2
+ cute_tutorial_sgemm_2
sgemm_2.cu
)
cutlass_example_add_executable(
- sgemm_sm70
+ cute_tutorial_sgemm_sm70
sgemm_sm70.cu
)
cutlass_example_add_executable(
- sgemm_sm80
+ cute_tutorial_sgemm_sm80
sgemm_sm80.cu
)
cutlass_example_add_executable(
- tiled_copy
+ cute_tutorial_tiled_copy
tiled_copy.cu
)
-cutlass_example_add_executable(
- wgmma_sm90
- wgmma_sm90.cu
-)
-
diff --git a/examples/cute/tutorial/blackwell/01_mma_sm100.cu b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
new file mode 100644
index 00000000..3f73140a
--- /dev/null
+++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
@@ -0,0 +1,592 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// CuTe Tutorial for SM100 Programming
+// This tutorial series demonstrates CuTe Blackwell capabilities that are frequently used
+// throughout CUTLASS. The goal is to familiarize developers with CuTe SM100 interfaces.
+//
+// The tutorial series is split into five stages:
+// * 01_mma_sm100.cu: Simple Blackwell SM100 GEMM using a tcgen05.mma instruction.
+// * 02_mma_tma_sm100.cu: Simple Blackwell SM100 GEMM using tcgen05.mma and TMA instructions.
+// * 03_mma_tma_multicast_sm100.cu: Blackwell SM100 GEMM using tcgen05.mma and Multicast TMA.
+// * 04_mma_tma_2sm_sm100.cu: Blackwell SM100 GEMM with 2SM tcgen05.mma and 2SM Multicast TMA.
+// * 05_mma_tma_epi_sm100.cu: Blackwell SM100 GEMM with 2SM tcgen05.mma, 2SM TMA mainloop, and TMA epilogue.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cstdio>
+#include <cstdlib>
+
+// Use Thrust to handle host/device allocations
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+// Cutlass includes
+#include <cutlass/half.h>                        // F16 data type
+#include <cutlass/cutlass.h>
+#include <cutlass/arch/mma.h>                    // SM100 MMA support macros
+#include <cutlass/arch/barrier.h>                // cutlass::arch::umma_arrive
+
+// CuTe includes
+#include <cute/tensor.hpp>                       // CuTe tensor implementation
+#include <cute/arch/cluster_sm90.hpp>            // CuTe functions for querying the details of the launched cluster
+#include <cute/numeric/integral_constant.hpp>    // Compile-time constants such as _1, _256, etc.
+#include <cute/algorithm/cooperative_copy.hpp>   // cute::cooperative_copy
+
+// Tutorial helpers
+#include "example_utils.hpp"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tutorial 01: Simple Blackwell SM100 GEMM using a tcgen05.mma instruction
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// The goal of this tutorial is to show the CuTe interface for tcgen05.mma and tcgen05.ld operations.
+// We will implement a GEMM operation: D (F32) = beta * C (F32) + alpha * A (F16) * B (F16) where:
+// - Matrix A is MxK, K-major (BLAS transpose T, row-major)
+// - Matrix B is NxK, K-major (BLAS transpose N, column-major)
+// - Matrices C and D are MxN, N-major (BLAS row-major)
+//
+// This GEMM kernel performs the following steps:
+// 1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) for one MmaTile
+// using auto-vectorizing copy operations.
+// 2. Perform matrix multiply-accumulate (MMA) operations using the tcgen05.mma instruction.
+// 3. Load the completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld.
+// 4. Read the C matrix from global memory (GMEM) to registers (RMEM).
+// 5. Apply alpha and beta scaling to the MMA accumulator and C matrix.
+// 6. Store D matrix from registers (RMEM) to global memory (GMEM).
+//
+// SM100 tcgen05.mma instructions operate as follows:
+// - Read matrix A from SMEM or TMEM
+// - Read matrix B from SMEM
+// - Write accumulator to TMEM
+// The accumulator in TMEM must then be loaded to registers before writing back to GMEM.
+//
+// The tcgen05.mma instruction requires an Instruction Descriptor that encodes the A, B, and accumulator
+// data types as well as the MMA's M and N dimensions.
+// The A and B matrices read from SMEM must be passed to the MMA instruction as SMEM Descriptors;
+// in CuTe terminology, these are the A and B fragments of the tcgen05.mma.
+// CuTe builds both kinds of descriptors transparently through the MMA atom and its fragments, as shown in this tutorial.
+//
+// The MMA details:
+// We use the tcgen05.mma.f16 instruction (F16xF16 = F32) that performs a 128x256x16 MMA
+// operation. F32 accumulator type is chosen since both C and D matrices use F32.
+// This example uses F16xF16 = F32 MMA where:
+// TypeA = cutlass::half_t; // MMA A Data Type
+// TypeB = cutlass::half_t; // MMA B Data Type
+// TypeC = float; // MMA C Data Type
+// TypeD = float; // MMA D Data Type
+// TypeAccumulator = float; // Both TypeC and TypeD are float, so we use float accumulator type
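+//
+// For illustration only (a sketch, not the tutorial's actual host-side setup, which appears later in
+// this file and may differ): a TiledMMA for this configuration could be built from a single SM100 F16
+// MMA atom, assuming the CuTe atom name SM100_MMA_F16BF16_SS and a 128x256x64 MMA tiler, e.g.
+//
+//   TiledMMA tiled_mma = make_tiled_mma(SM100_MMA_F16BF16_SS<TypeA, TypeB, float,
+//                                                            128, 256,
+//                                                            UMMA::Major::K, UMMA::Major::K>{});
+//   auto mma_tiler = make_shape(Int<128>{}, Int<256>{}, Int<64>{});  // (MmaTile_M, MmaTile_N, MmaTile_K)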
+
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+// The shared memory buffers for A and B matrices.
+template <class TypeA, class ASmemLayout,   // (MmaA, NumMma_M, NumMma_K, ...)
+          class TypeB, class BSmemLayout>   // (MmaB, NumMma_N, NumMma_K, ...)
+struct SharedStorage
+{
+ alignas(128) cute::ArrayEngine<TypeA, cute::cosize_v<ASmemLayout>> A;
+ alignas(128) cute::ArrayEngine<TypeB, cute::cosize_v<BSmemLayout>> B;
+
+ alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM
+
+ CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(A.begin()), ASmemLayout{}); }
+ CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(B.begin()), BSmemLayout{}); }
+};
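+
+// Note (illustration only, not part of the original tutorial flow): ArrayEngine<T, N> is a static SMEM
+// array of N elements of T, with N = cosize_v<Layout>, i.e. the number of elements the layout addresses.
+// For a hypothetical un-swizzled 128x64 K-major A tile this would be
+//
+//   auto sA_layout = make_layout(make_shape(Int<128>{}, Int<64>{}), make_stride(Int<64>{}, Int<1>{}));
+//   static_assert(cute::cosize_v<decltype(sA_layout)> == 128 * 64);
+//
+// The layouts actually used below are 128B-swizzled UMMA SMEM atoms (printed as Sw<3,4,3> in tCsA/tCsB),
+// and alignas(128) keeps the buffers at the conservative alignment those operands expect.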
+
+// The device kernel
+template <class SharedStorage,
+          class ATensor, class BTensor, class CTensor, class DTensor,
+          class MmaTiler_MNK, class TiledMMA, class ClusterShape_MNK,
+          class Alpha, class Beta>
+__global__ static
+void
+gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
+ BTensor mB, // (Gemm_N, Gemm_K)
+ CTensor mC, // (Gemm_M, Gemm_N)
+ DTensor mD, // (Gemm_M, Gemm_N)
+ MmaTiler_MNK mma_tiler, // (MmaTile_M, MmaTile_N, MmaTile_K)
+ TiledMMA tiled_mma, // (Mma_M, Mma_N, Mma_K)
+ ClusterShape_MNK cluster_shape, // (ClusterM, ClusterN, ClusterK)
+ Alpha alpha, Beta beta)
+{
+ // Step 1: The Prologue.
+
+ // The CTA layout within the Cluster: (V,M,N,K) -> CTA idx
+ Layout cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape),
+ make_tile(typename TiledMMA::AtomThrID{}));
+
+ // Construct the MMA grid coordinate from the CTA grid coordinate
+ auto mma_coord_vmnk = make_coord(blockIdx.x % size<0>(cluster_layout_vmnk), // Peer CTA coordinate
+ blockIdx.x / size<0>(cluster_layout_vmnk), // MMA-M coordinate
+ blockIdx.y, // MMA-N coordinate
+ _); // MMA-K coordinate
+
+ // Partition the GMEM tensors with the mma_tiler and mma_coord to get the slices processed
+ // by this mma tile.
+ // CuTe provides local_tile partitioning function. local_tile accepts 4 parameters:
+ // * Tensor to partition
+ // * Tiler to use for partitioning
+ // * Coordinate to use for slicing the partitioned tensor
+ // * Projection to ignore unwanted modes of the Tiler and Coordinate
+ auto mma_coord = select<1,2,3>(mma_coord_vmnk);
+ Tensor gA = local_tile(mA, mma_tiler, mma_coord, Step<_1, X,_1>{}); // (MmaTile_M, MmaTile_K, Tiles_K)
+ Tensor gB = local_tile(mB, mma_tiler, mma_coord, Step< X,_1,_1>{}); // (MmaTile_N, MmaTile_K, Tiles_K)
+ Tensor gC = local_tile(mC, mma_tiler, mma_coord, Step<_1,_1, X>{}); // (MmaTile_M, MmaTile_N)
+ Tensor gD = local_tile(mD, mma_tiler, mma_coord, Step<_1,_1, X>{}); // (MmaTile_M, MmaTile_N)
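+
+ // Worked example of the projections above (the numbers match the prints below): with mA = (512,256),
+ // mB = (1024,256) and mma_tiler = (128,256,64), Step<_1, X,_1> keeps only the (M,K) modes of the tiler
+ // for A, so gA has shape (128,64,4), where 4 = Gemm_K / MmaTile_K is the number of K tiles; likewise
+ // Step< X,_1,_1> keeps the (N,K) modes for B, and Step<_1,_1, X> keeps the (M,N) modes for C and D.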
+
+ if (thread0()) {
+ print("mA:\t"); print(mA); print("\n"); // mA: gmem_ptr[16b](GMEM_ADDR_A) o (512,256):(256,_1)
+ print("mB:\t"); print(mB); print("\n"); // mB: gmem_ptr[16b](GMEM_ADDR_B) o (1024,256):(256,_1)
+ print("mC:\t"); print(mC); print("\n"); // mC: gmem_ptr[32b](GMEM_ADDR_C) o (512,1024):(1024,_1)
+ print("mD:\t"); print(mD); print("\n"); // mD: gmem_ptr[32b](GMEM_ADDR_D) o (512,1024):(1024,_1)
+
+ print("gA:\t"); print(gA); print("\n"); // gA: gmem_ptr[16b](GMEM_ADDR_A + offset_for_mma_tile) o (_128,_64,4):(256,_1,_64)
+ print("gB:\t"); print(gB); print("\n"); // gB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile) o (_256,_64,4):(_1,256,16384)
+ print("gC:\t"); print(gC); print("\n"); // gC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile) o (_128,_256):(256,_1)
+ print("gD:\t"); print(gD); print("\n"); // gD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile) o (_128,_256):(256,_1)
+ } __syncthreads();
+
+ // The SMEM tensors
+
+ // Allocate SMEM
+ extern __shared__ char shared_memory[];
+ SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);
+
+ // Represent the SMEM buffers for A and B
+ Tensor tCsA = shared_storage.tensor_sA(); // (MmaA, NumMma_M, NumMma_K)
+ Tensor tCsB = shared_storage.tensor_sB(); // (MmaB, NumMma_N, NumMma_K)
+
+ //
+ // Mma partitioning for A and B
+ //
+ // Note: Partitioned tensors use tXgY naming convention:
+ // tXgY -> The partitioning pattern tX applied to tensor gY
+
+ auto mma_v = get<0>(mma_coord_vmnk);
+ ThrMMA cta_mma = tiled_mma.get_slice(mma_v); // Use Peer CTA coordinate
+ Tensor tCgA = cta_mma.partition_A(gA); // (MmaA, NumMma_M, NumMma_K, Tiles_K)
+ Tensor tCgB = cta_mma.partition_B(gB); // (MmaB, NumMma_N, NumMma_K, Tiles_K)
+ Tensor tCgC = cta_mma.partition_C(gC); // (MmaC, NumMma_M, NumMma_N)
+ Tensor tCgD = cta_mma.partition_C(gD); // (MmaC, NumMma_M, NumMma_N)
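+
+ // Illustrative note: partition_A/B/C reshape a tile into (MmaOp, NumMma_M/N, NumMma_K, ...) so that the
+ // first mode is exactly the data consumed by one MMA instruction. For this 128x256x16 MMA and a
+ // 128x256x64 MMA tile, tCgA is ((_128,_16),_1,_4,4): one 128x16 A block per instruction, 1 MMA in M,
+ // 4 MMAs in K per tile, and 4 K tiles (see the prints below).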
+
+ if (thread0()) {
+ print("tCgA:\t"); print(tCgA); print("\n"); // tCgA: gmem_ptr[16b](GMEM_ADDR_A + offset_for_mma_tile + offset_for_mma) o ((_128,_16),_1,_4,4):((256,_1),_0,_16,_64)
+ print("tCgB:\t"); print(tCgB); print("\n"); // tCgB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile + offset_for_mma) o ((_256,_16),_1,_4,4):((_1,256),_0,4096,16384)
+ print("tCgC:\t"); print(tCgC); print("\n"); // tCgC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((256,_1),_0,_0)
+ print("tCgD:\t"); print(tCgD); print("\n"); // tCgD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((256,_1),_0,_0)
+ } __syncthreads();
+
+ // MMA Fragment Allocation
+ // We allocate "fragments" which are SMEM descriptors that serve as inputs to cute::gemm operations.
+ // For tcgen05.mma operations:
+ // - Matrices A and B are sourced from SMEM
+ // - tCrA and tCrB provide descriptor views of tCsA and tCsB respectively
+ // - The first mode of each descriptor represents the SMEM for a single MMA operation
+ Tensor tCrA = cta_mma.make_fragment_A(tCsA); // (MmaA, NumMma_M, NumMma_K)
+ Tensor tCrB = cta_mma.make_fragment_B(tCsB); // (MmaB, NumMma_N, NumMma_K)
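+
+ // Illustrative note: each element of tCrA/tCrB is a 64-bit UMMA SMEM descriptor (see the
+ // UMMA::DescriptorIterator prints below) rather than data held in registers; the MMA instruction
+ // dereferences the descriptor to read A and B directly from SMEM, so no SMEM->RMEM copies of A and B
+ // are needed on SM100.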
+
+ // TMEM Allocation
+ // On SM100 architecture, accumulators are stored exclusively in tensor memory (TMEM).
+ // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator.
+ Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N)
+
+ if (thread0()) {
+ print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16)
+ print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16)
+ print("tCrA:\t"); print(tCrA); print("\n"); // tCrA: UMMA::DescriptorIterator o (_1,_1,_4):(_0,_0,_2)
+ print("tCrB:\t"); print(tCrB); print("\n"); // tCrB: UMMA::DescriptorIterator o (_1,_1,_4):(_0,_0,_2)
+ print("tCtAcc:\t"); print(tCtAcc); print("\n"); // tCtAcc: tmem_[32b](TMEM_ADDR) o ((_128,_256),_1,_1):((_65536,_1),_0,_0)
+ } __syncthreads();
+
+ // Barrier Initialization
+ uint32_t elect_one_thr = cute::elect_one_sync();
+ uint32_t elect_one_warp = (threadIdx.x / 32 == 0);
+
+ // Barriers in SMEM initialized by a single thread.
+ if (elect_one_warp && elect_one_thr) {
+ cute::initialize_barrier(shared_storage.mma_barrier, /* num_ctas */ 1);
+ }
+ int mma_barrier_phase_bit = 0; // Each barrier has an associated phase_bit.
+ __syncthreads(); // Make sure all threads observe barrier initialization.
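+
+ // Illustrative note on the barrier protocol used in the mainloop below: an SMEM mbarrier flips its
+ // phase each time the expected number of arrivals (here 1, the MMA) completes. wait_barrier(bar, phase)
+ // blocks until the barrier's phase differs from `phase`, so each iteration waits and then flips the
+ // local phase bit:
+ //
+ //   cute::wait_barrier(shared_storage.mma_barrier, mma_barrier_phase_bit);
+ //   mma_barrier_phase_bit ^= 1;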
+
+ // Step 2: The Mainloop.
+
+ // Set the MMA accumulate option to Zero so that the first MMA instruction clears the TMEM accumulator.
+ tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
+
+ // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
+ for (int k_tile = 0; k_tile < size<3>(tCgA); ++k_tile)
+ {
+ // Step 2a: Load A and B tiles
+
+ // Using auto-vectorized copy operation:
+ // - Utilizes 128 threads for parallel data transfer
+ // - Copy operations are distributed efficiently across all threads
+ // - CuTe can automatically determine optimal vector width
+ cooperative_copy<128>(threadIdx.x, tCgA(_,_,_,k_tile), tCsA); // Load MmaTile_M x MmaTile_K A tile
+ cooperative_copy<128>(threadIdx.x, tCgB(_,_,_,k_tile), tCsB); // Load MmaTile_N x MmaTile_K B tile
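+
+ // Illustrative note: cooperative_copy<N>(tid, src, dst) partitions the copy across N cooperating
+ // threads and auto-vectorizes up to the widest load/store the layouts' alignment and contiguity allow
+ // (up to 128-bit vectors). An equivalent hand-written form would build a TiledCopy and call copy(),
+ // e.g., with a hypothetical TiledCopy tiled_copy_a built with make_tiled_copy for 128 threads:
+ //
+ //   ThrCopy thr_copy_a = tiled_copy_a.get_slice(threadIdx.x);
+ //   copy(thr_copy_a.partition_S(tCgA(_,_,_,k_tile)), thr_copy_a.partition_D(tCsA));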
+
+ // Step 2b: Execute the MMAs for this tile
+
+ // Wait for loads to SMEM to complete with __syncthreads()
+ __syncthreads();
+
+ // tcgen05.mma instructions require single-thread execution:
+ // - Only one warp performs the MMA-related loop operations
+ // - CuTe operations internally manage the single-thread execution of tcgen05.mma and tcgen05.cp
+ // - No explicit elect_one_sync region is needed from the user
+ if (elect_one_warp) {
+ // Execute a MmaTile_M x MmaTile_N x MmaTile_K GEMM
+ for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+ gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCtAcc);
+ tiled_mma.accumulate_ = UMMA::ScaleOut::One;
+ }
+ // Make the MMA arrive on the barrier once all issued MMAs have finished reading A and B from SMEM.
+ cutlass::arch::umma_arrive(&shared_storage.mma_barrier);
+ }
+ // Wait for the MMAs to complete before the next k_tile iteration overwrites the A and B SMEM.
+ cute::wait_barrier(shared_storage.mma_barrier, mma_barrier_phase_bit);
+ mma_barrier_phase_bit ^= 1;
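+
+ // Illustrative note: tcgen05.mma executes asynchronously, so __syncthreads() alone cannot guarantee
+ // that the MMAs have finished reading SMEM. umma_arrive above asks the MMA pipeline to arrive on the
+ // mbarrier once all previously issued MMAs are done with their SMEM operands, and the wait above
+ // releases the A and B buffers for the next k_tile.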
+ }
+
+ // Step 3: The Epilogue.
+
+ // Create the tiled copy operation for the accumulator (TMEM -> RMEM)
+ TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc);
+ ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x);
+
+ Tensor tDgC = thr_t2r_copy.partition_D(tCgC); // (CpyD, NumCpy_M, NumCpy_N)
+ Tensor tDrC = make_fragment_like(tDgC); // (CpyD, NumCpy_M, NumCpy_N)
+ // Load C tensor GMEM -> RMEM
+ copy(tDgC, tDrC);
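+
+ // Illustrative note: SM100_TMEM_LOAD_32dp32b1x names a tcgen05.ld shape -- 32 TMEM data paths ("dp")
+ // of 32-bit elements, repeated 1x per instruction. make_tmem_copy tiles that atom over the 128x256
+ // accumulator so the CTA's threads collectively read all of the accumulator from TMEM into registers.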
+
+ Tensor tDtAcc = thr_t2r_copy.partition_S(tCtAcc); // (CpyS, NumCpy_M, NumCpy_N)
+ Tensor tDgD = thr_t2r_copy.partition_D(tCgD); // (CpyD, NumCpy_M, NumCpy_N)
+ using AccType = typename decltype(tCtAcc)::value_type;
+ Tensor tDrAcc = make_tensor