diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc269c8b..9f423fb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,31 @@
# NVIDIA CUTLASS Changelog
+
+## [3.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.0) (2025-03-20)
+
+* Support for Blackwell SM120 kernels for GeForce GPUs in the CUTLASS 3.x API:
+ - Collective mainloops that target:
+ * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp)
+ * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp)
+ - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
+ - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp).
+* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture:
+ - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu).
+ - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu).
+ - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu).
+* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM.
+* Enhancements and new support for block-wise and group-wise GEMM on the Hopper and Blackwell architectures:
+ - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture.
+ - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture.
+ - Support for [grouped GEMM with blockwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture.
+ - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture.
+ - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture.
+* Support for enhanced kernel performance search in the CUTLASS profiler:
+ - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels.
+ - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance.
+ - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration.
+ - A more detailed introduction and examples of how to use these features can be found in [profiler.md](./media/docs/profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss).
+
## [3.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.8.0) (2025-01-25)
* Support for new CuTe building blocks specifically for Blackwell SM100 architecture:
@@ -538,4 +564,3 @@ SPDX-License-Identifier: BSD-3-Clause
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65821237..1e6f298e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,6 +102,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
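+# Do not truncate template instantiation backtraces in compiler diagnostics (a limit of 0 means unlimited).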
+list(APPEND CUTLASS_CUDA_NVCC_FLAGS -ftemplate-backtrace-limit=0)
+
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE)
endif()
@@ -173,7 +175,7 @@ if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.8)
- list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100 100a)
+ list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100 100a 101 101a 120 120a)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
@@ -441,7 +443,7 @@ if (NOT MSVC AND CUTLASS_NVCC_KEEP)
# MSVC flow handles caching already, but for other generators we handle it here.
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
file(MAKE_DIRECTORY ${CUTLASS_NVCC_KEEP_DIR})
- list(APPEND CUTLASS_CUDA_NVCC_FLAGS --keep -v) # --keep-dir may not work with nvcc for some directories.
+ list(APPEND CUTLASS_CUDA_NVCC_FLAGS --keep -v -objtemp) # --keep-dir may not work with nvcc for some directories.
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -save-temps=${CUTLASS_NVCC_KEEP_DIR})
endif()
@@ -468,6 +470,13 @@ if(UNIX)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=-fno-strict-aliasing)
endif()
+# Known CUDA Toolkit 11.4 issue (fixed in later toolkit releases).
+# Also see https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void-function-in-constexpr-if-fun
+if (CUDA_VERSION VERSION_LESS 11.5.0)
+ list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcudafe "--diag_suppress=implicit_return_from_non_void_function" )
+ message(STATUS "CUDA ${CUDA_VERSION} < 11.5: suppressing implicit_return_from_non_void_function diagnostics")
+endif()
+
# Don't leak lineinfo in release builds
if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -gmlt)
@@ -1045,6 +1054,7 @@ function(cutlass_generate_profiler_tests NAME)
string(REGEX REPLACE "_cluster_k_fallback=[0-9]+" "" TEST_NAME "${TEST_NAME}")
string(REPLACE "runtime_input_datatype_a=" "" TEST_NAME "${TEST_NAME}")
string(REPLACE "runtime_input_datatype_b=" "" TEST_NAME "${TEST_NAME}")
+ string(REPLACE "swizzle_size=" "" TEST_NAME "${TEST_NAME}")
string(REGEX REPLACE "verification_enabled=(true|false)" "" TEST_NAME "${TEST_NAME}")
string(REGEX REPLACE "warmup_iterations=[0-9]+" "" TEST_NAME "${TEST_NAME}")
string(REGEX REPLACE "profiling_iterations=[0-9]+" "" TEST_NAME "${TEST_NAME}")
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 843ed365..46506007 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -128,3 +128,35 @@ Bryce Lelbach
Joel McCormack
Kyrylo Perelygin
Sean Treichler
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/PUBLICATIONS.md b/PUBLICATIONS.md
index c91fc06a..176b42e4 100644
--- a/PUBLICATIONS.md
+++ b/PUBLICATIONS.md
@@ -2,10 +2,14 @@
## 2025
+- ["Comet: Fine-grained Computation-communication Overlapping for Mixture-of-Experts"](https://arxiv.org/abs/2502.19811). Shulai Zhang, Ningxin Zheng, Haibin Lin, Ziheng Jiang, Wenlei Bao, Chengquan Jiang, Qi Hou, Weihao Cui, Size Zheng, Li-Wen Chang, Quan Chen, Xin Liu. _arXiv_, February 2025.
+
- ["ParetoQ: Scaling Laws in Extremely Low-bit LLM Quantization"](https://arxiv.org/abs/2502.02631). Zechun Liu, Changsheng Zhao, Hanxian Huang, Sijia Chen, Jing Zhang, Jiawei Zhao, Scott Roy, Lisa Jin, Yunyang Xiong, Yangyang Shi, Lin Xiao, Yuandong Tian, Bilge Soran, Raghuraman Krishnamoorthi, Tijmen Blankevoort, Vikas Chandra. _arXiv_, February 2025.
## 2024
+- ["DeepSeek-V3 Technical Report"](https://arxiv.org/abs/2412.19437). DeepSeek-AI. _arXiv_, December 2024.
+
- ["ShadowKV: KV Cache in Shadows for High-Throughput Long-Context LLM Inference"](https://arxiv.org/abs/2410.21465). Hanshi Sun, Li-Wen Chang, Wenlei Bao, Size Zheng, Ningxin Zheng, Xin Liu, Harry Dong, Yuejie Chi, Beidi Chen. _arXiv_, October 2024.
- ["FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion"](https://arxiv.org/abs/2406.06858). Li-Wen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Chengji Yao, Ziheng Jiang, Haibin Lin, Xin Jin, Xin Liu. _arXiv_, June 2024.
@@ -64,3 +68,35 @@
"](https://arxiv.org/abs/2008.13006). Cong Guo, Bo Yang Hsueh, Jingwen Leng, Yuxian Qiu, Yue Guan, Zehuan Wang, Xiaoying Jia, Xipeng Li, Minyi Guo, Yuhao Zhu. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2020.
- ["Strassen's Algorithm Reloaded on GPUs"](https://dl.acm.org/doi/10.1145/3372419). Jianyu Huang, Chenhan D. Yu, Robert A. van de Geijn. _ACM Transactions on Mathematical Software_, March 2020.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/README.md b/README.md
index ada18b39..77a81620 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@

-# CUTLASS 3.8.0
+# CUTLASS 3.9.0
-_CUTLASS 3.8.0 - January 2025_
+_CUTLASS 3.9.0 - March 2025_
CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-matrix multiplication (GEMM) and related computations at all levels
@@ -38,65 +38,30 @@ See the [functionality docs](./media/docs/functionality.md) for a more comprehen
list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU
architecture.
-# What's New in CUTLASS 3.8
+# What's New in CUTLASS 3.9
-CUTLASS 3.8 is the first release that supports the NVIDIA Blackwell SM100 architecture.
-For a background on Blackwell's new features, please consult the PTX documentation for CUDA 12.8.
-
-* Support for new CuTe building blocks specifically for Blackwell SM100 architecture:
- - [5th generation Blackwell Tensor Core instructions (TCGen05)](./include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms.
- - Extensions to [Tensor Memory Accelerator](./include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms.
- - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](./include/cute/pointer.hpp) across CuTe as a first class data locale.
- - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](./include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe.
- - [`make_tmem_copy()`](./include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms.
- - Support for [new variants of LDSM on Blackwell](./include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms.
-* Support for new CUTLASS building blocks specifically for Blackwell SM100 architecture:
- - Various narrow precision [FP4, FP6, and FP8](./include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](./include/cutlass/float_subbyte.h)
- - [Pipelines that implement Blackwell specific synchronization](./include/cutlass/pipeline/sm100_pipeline.hpp).
- - [Cluster launch control API supporting preferred and fallback cluster shapes](./include/cutlass/cluster_launch.hpp).
- - Data types including NVFP4, MXFP4, MXFP6, and MXFP8 and all their supported element and scale factor types.
- - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](./include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](./include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp).
- - Extensions to testbeds and reference check code for unit tests and CUTLASS profiler.
-* Full support for Blackwell SM100 kernels in CUTLASS 3.x API:
- - [Blackwell specific kernel layers](./include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that
- + Implement a new warp-specialization recipe tuned specifically for Blackwell SM100 architecture.
- + Leverage all the new features such as CLC based tile scheduling, preferred cluster, and TMEM based double buffering of accumulators.
- + Support stream-K load balancing for all kernel types everywhere via composable scheduler support.
- - Blackwell collective mainloops that target the TCGen05 MMA instructions (both SS and TS) for
- * [Non-block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp)
- * [Non-block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_array_warpspecialized.hpp)
- * [Block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp)
- * [Block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp)
- - Blackwell [collective mainloop for convolution kernels](./include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp) supporting non-block scaled data types for fprop, dgrad, and wgrad.
- - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp), [convolution](./include/cutlass/conv/dispatch_policy.hpp), and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
- - [Blackwell epilogue that supports loading accumulators from `tmem`](./include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp) and [full set of EVT fusions]().
-* CUTLASS library and profiler integration for block scaled data types for kernel emission, profiling, and verification.
- - Support for preferred and fallback cluster shapes via profiler command line arguments parsing to set dynamic cluster shapes.
- - Support for dynamic datatypes by parsing profiler via profiler command line arguments parsing to set dynamic datatype setting in TCGen05 MMA instruction descriptors.
- - Support for mixed input GEMM kernels on Hopper in the profiler.
-* New CUTLASS profiler flag `use-cuda-graphs` to reduce overheads when benchmarking launch-bound kernels.
-* A new 3.x version of grouped GEMM to the CUTLASS library and generates kernels for Hopper and Blackwell. Now grouped GEMM support is enabled in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details).
-* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM100 architecture:
- - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](./examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API.
- - GEMM with [opt-in collective builder schedules showcasing available recipes](./examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell.
- - Block scaled data type GEMMs targeting Blackwell's native block scaled Tensor Cores:
- + [NVFP4 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu)
- + [NVFP4 inputs with NVFP4 output](./examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu)
- + [Mixed MXFP8 and MXFP6 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu)
- - GEMM example demonstrating [Blackwell's new preferred cluster support via dynamic cluster shapes](./examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for increased occupancy.
- - [GEMM with CLC based StreamK scheduler for load balancing](./examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu).
- - Grouped GEMM for [vanilla FP8 data inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu).
- - Convolution kernels for [fprop](./examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](./examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](./examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu).
- - [Fused multi-head attention fprop kernel](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32,64, and 128.
- - A new BF16x9 GEMM [kernel](./examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations.
-* Set of examples that demonstrate the usage of the 3.x API for targeting Hopper architecture:
- - A set of new [Hopper grouped GEMM kernels](./examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes.
- - A new [Hopper FP8 GEMM with groupwise scaling](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu).
-* Documentation updates:
- - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/quickstart.md#instantiating-a-blackwell-gemm-kernel).
- - Detailed [Blackwell block-scaled GEMM functionality documentation](./media/docs/blackwell_functionality.md)
- - A new [functionality documentation](./media/docs/functionality.md) specifically for 3.x API comprehensively documenting all supported kernel types, data types, kernel features, minimum CUDA tookit support etc for 3.x supported architectures.
- - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#Target-Architecture).
+* Support for Blackwell SM120 kernels for GeForce GPUs in the CUTLASS 3.x API (see the builder sketch after this list):
+ - Collective mainloops that target:
+ * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp)
+ * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp)
+ - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
+ - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp).
+* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture:
+ - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu).
+ - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu).
+ - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu).
+* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM.
+* Enhancements and new support for block-wise and group-wise GEMM on the Hopper and Blackwell architectures:
+ - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture.
+ - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture.
+ - Support for [grouped GEMM with blockwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture.
+ - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture.
+ - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture.
+* Support for enhanced kernel performance search in the CUTLASS profiler:
+ - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels.
+ - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance.
+ - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration.
+ - A more detailed introduction and examples of how to use these features can be found in [profiler.md](./media/docs/profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss).
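+
+To make the new SM120 support more concrete, below is a minimal, untested sketch of how such a block-scaled GEMM can be assembled with the 3.x collective builder API. It follows the established SM90/SM100 builder pattern; the SM120-specific tags (`cutlass::arch::Sm120`, `cutlass::arch::OpClassBlockScaledTensorOp`), the NVFP4 wrapper type, and the tile/alignment choices below are assumptions rather than the shipping configuration. See [example 79a](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu) for the authoritative type definitions.
+
+```cpp
+// Minimal sketch only: names marked "assumed" may differ from the shipping example.
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cute/tensor.hpp"
+
+using ElementA           = cutlass::nv_float4_t<cutlass::float_e2m1_t>;  // assumed NVFP4 wrapper (data + scale factors)
+using ElementB           = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+using ElementC           = cutlass::bfloat16_t;
+using ElementAccumulator = float;
+using TileShape          = cute::Shape<cute::_128, cute::_128, cute::_128>;  // assumed CTA tile
+using ClusterShape       = cute::Shape<cute::_1, cute::_1, cute::_1>;
+
+// Build the epilogue first so its shared-memory footprint can be carved out of the mainloop stages.
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+    cutlass::arch::Sm120, cutlass::arch::OpClassBlockScaledTensorOp,          // assumed SM120 tags
+    TileShape, ClusterShape,
+    cutlass::epilogue::collective::EpilogueTileAuto,
+    ElementAccumulator, ElementAccumulator,
+    ElementC, cutlass::layout::ColumnMajor, 8,
+    ElementC, cutlass::layout::ColumnMajor, 8,
+    cutlass::epilogue::collective::EpilogueScheduleAuto
+  >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+    cutlass::arch::Sm120, cutlass::arch::OpClassBlockScaledTensorOp,          // assumed SM120 tags
+    ElementA, cutlass::layout::RowMajor, 32,                                  // 32 = 128 bits / 4-bit NVFP4 elements
+    ElementB, cutlass::layout::ColumnMajor, 32,
+    ElementAccumulator,
+    TileShape, ClusterShape,
+    cutlass::gemm::collective::StageCountAutoCarveout<
+        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+    cutlass::gemm::collective::KernelScheduleAuto
+  >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+    cute::Shape<int, int, int, int>,   // problem shape (M, N, K, L)
+    CollectiveMainloop,
+    CollectiveEpilogue>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+```
+
+The resulting `Gemm` type is used like any other 3.x device GEMM: populate `Gemm::Arguments`, then call `initialize()` and `run()`.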
Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits.
CUTLASS team is working on a fix.
diff --git a/customConfigs.cmake b/customConfigs.cmake
index e39212db..d98fe6c5 100644
--- a/customConfigs.cmake
+++ b/customConfigs.cmake
@@ -65,10 +65,10 @@ endfunction()
if(CUTLASS_BUILD_FOR_PROFILER_REGRESSIONS)
- set(PROFILER_ARCH_LIST 100a)
+ set(PROFILER_ARCH_LIST 100a 101a 120a)
foreach(ARCH IN LISTS CUTLASS_NVCC_ARCHS)
if(NOT (ARCH IN_LIST PROFILER_ARCH_LIST))
- message(FATAL_ERROR "Only SM100a compute capability is supported with profiler-based unit tests")
+ message(FATAL_ERROR "Only the SM100a, SM101a, and SM120a compute capabilities are supported with profiler-based unit tests")
endif()
endforeach()
diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md
index 9fa8297d..ed9b2727 100644
--- a/examples/13_two_tensor_op_fusion/README.md
+++ b/examples/13_two_tensor_op_fusion/README.md
@@ -115,4 +115,3 @@ SPDX-License-Identifier: BSD-3-Clause
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
-
diff --git a/examples/40_cutlass_py/README.md b/examples/40_cutlass_py/README.md
index c670e340..02222f8e 100644
--- a/examples/40_cutlass_py/README.md
+++ b/examples/40_cutlass_py/README.md
@@ -2,3 +2,35 @@
This directory contains deprecated examples for PyCUTLASS, a precursor to the CUTLASS Python interface.
For examples of using CUTLASS's actively-maintained Pythonic interface, see the [examples/python](/examples/python) directory.
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/40_cutlass_py/customizable/README.md b/examples/40_cutlass_py/customizable/README.md
index e8aeee9e..b6863fb0 100644
--- a/examples/40_cutlass_py/customizable/README.md
+++ b/examples/40_cutlass_py/customizable/README.md
@@ -165,3 +165,35 @@ Example 7: GELU
```python
python gemm.py -i 16 8 16 -ta bfloat16 -tb bfloat16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 64 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -p 512 256 128 -alpha 0.0 -beta 0.5 -gm GemmSplitKParallel -k 5 -bias -activ gelu
```
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/55_hopper_mixed_dtype_gemm/README.md b/examples/55_hopper_mixed_dtype_gemm/README.md
index ca64c901..7c61e75c 100644
--- a/examples/55_hopper_mixed_dtype_gemm/README.md
+++ b/examples/55_hopper_mixed_dtype_gemm/README.md
@@ -41,3 +41,35 @@ We are currently optimizing the following cases:
* Optimizations for memory bound cases.
* Optimizations for scale and zero-point loading when the group size is not equal to the threadblock-k size.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/59_ampere_gather_scatter_conv/README.md b/examples/59_ampere_gather_scatter_conv/README.md
index 4aac0536..2f3d8b83 100644
--- a/examples/59_ampere_gather_scatter_conv/README.md
+++ b/examples/59_ampere_gather_scatter_conv/README.md
@@ -207,3 +207,35 @@ With this in mind, this example kernel has the following limitations:
- This example kernel only supports dynamic image count, all other conv problem shape must be defined as `cute::Constant<>`s
- Problem shapes (including dynamic image count `N`) must be evenly divisible by the tile shape
- It does not perform fp32->tf32 numeric conversion, gmem inputs must be rounded to tf32 already
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
index c9f638e6..72f59476 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
+++ b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
@@ -26,11 +26,13 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-include_directories(
- .
-)
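+# Test case options for the CTest harness: --iterations=0 skips the profiling loop and runs functional verification only.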
+set(TEST_PREFETCH_CASE --m=8192 --n=64 --k=8192 --iterations=0)
cutlass_example_add_executable(
63_hopper_gemm_with_weight_prefetch
63_hopper_gemm_with_weight_prefetch.cu
- )
+ TEST_COMMAND_OPTIONS
+ TEST_PREFETCH_CASE
+)
+
+target_include_directories(63_hopper_gemm_with_weight_prefetch PUBLIC .)
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/README.md b/examples/63_hopper_gemm_with_weight_prefetch/README.md
index 5dac1cc6..3fd615ff 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/README.md
+++ b/examples/63_hopper_gemm_with_weight_prefetch/README.md
@@ -74,9 +74,40 @@ echo "Overlap ratio of 0.8, prefetch ratio of 0.7"
However, note that the example still runs a single GEMM, and most of the performance improvement
is expected in end to end applications.
-
## Limitations
* The parameter defaults are typically not good choices, especially `prefetch_ratio`.
When `prefetch_ratio` is unspecified (set to `-1.0`), the prefetch warp will `try_wait` on a
memory barrier before issuing every single TMA load, and in many cases this will slow down
prefetching to the point of being almost ineffective.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
index 0c54bc05..73655ad2 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
@@ -362,11 +362,11 @@ public:
using ClusterSyncWithPrefetchBarrier = typename cutlass::arch::NamedBarrier;
auto prefetcher_arrive_barrier = ClusterSyncWithPrefetchBarrier(
blockDim.x * blockDim.y * blockDim.z,
- /*reserved_named_barriers_*/ 14);
+ /*id*/ 0);
// Prefetcher warp doesn't arrive on this barrier.
auto cluster_arrive_barrier = ClusterSyncWithPrefetchBarrier(
blockDim.x * blockDim.y * blockDim.z - NumThreadsPerWarp,
- /*reserved_named_barriers_*/ 15);
+ /*id*/ 1);
if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::PrefetchMK) {
__syncwarp();
diff --git a/examples/65_distributed_gemm/README.md b/examples/65_distributed_gemm/README.md
index fc53e6bf..e3c48a9d 100644
--- a/examples/65_distributed_gemm/README.md
+++ b/examples/65_distributed_gemm/README.md
@@ -62,3 +62,36 @@ procedure is the same, simply modify the following line in the example:
```cpp
using TP = _8;
```
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
+
diff --git a/examples/65_distributed_gemm/REQUIREMENTS.md b/examples/65_distributed_gemm/REQUIREMENTS.md
index cc0d5632..4b8cca3b 100644
--- a/examples/65_distributed_gemm/REQUIREMENTS.md
+++ b/examples/65_distributed_gemm/REQUIREMENTS.md
@@ -84,3 +84,35 @@ GPU5 OK OK OK OK OK X OK OK
GPU6 OK OK OK OK OK OK X OK
GPU7 OK OK OK OK OK OK OK X
```
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
index e4afcb30..1c21678f 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
@@ -100,7 +100,7 @@ using LayoutB = cutlass::layout::ColumnMajor; // L
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
-using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands
+using ElementC = float; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
@@ -251,93 +251,93 @@ struct Result
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
- template <typename Element, typename Layout>
- bool initialize_tensor(
- cutlass::TensorView<Element, Layout> view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template <typename Element, typename Layout>
+bool initialize_tensor(
+ cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
- int bits_input = cutlass::sizeof_bits<Element>::value;
- int bits_output = cutlass::sizeof_bits<Element>::value;
+ double scope_max, scope_min;
+ int bits_input = cutlass::sizeof_bits<Element>::value;
+ int bits_output = cutlass::sizeof_bits<Element>::value;
- if (bits_input == 1) {
- scope_max = 2;
- scope_min = 0;
- } else if (bits_input <= 8) {
- scope_max = 2;
- scope_min = -2;
- } else if (bits_output == 16) {
- scope_max = 5;
- scope_min = -5;
- } else {
- scope_max = 8;
- scope_min = -8;
- }
-
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
+ if (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ } else if (bits_input <= 8) {
+ scope_max = 2;
+ scope_min = -2;
+ } else if (bits_output == 16) {
+ scope_max = 5;
+ scope_min = -5;
+ } else {
+ scope_max = 8;
+ scope_min = -8;
}
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, bits_input);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Helper to initialize a block of device data (scale_tensors)
- template <typename Element, typename Layout>
- bool initialize_scale_tensor(
- cutlass::TensorView<Element, Layout> view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template <typename Element, typename Layout>
+bool initialize_scale_tensor(
+ cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
+ double scope_max, scope_min;
- scope_min = -1;
- scope_max = 1;
+ scope_min = -1;
+ scope_max = 1;
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
- }
-
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
@@ -438,14 +438,18 @@ void initialize(const Options &options) {
if (IsDFp8 && options.save_amax) {
abs_max_D.resize(cutlass::make_Coord(1));
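+ // Zero-initialize the amax scalars so the kernel and the host reference both start from a known value.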
+ initialize_tensor(abs_max_D.host_view(), cutlass::Distribution::AllZeros, 0);
abs_max_D.sync_device();
reference_abs_max_D.resize(cutlass::make_Coord(1));
+ initialize_tensor(reference_abs_max_D.host_view(), cutlass::Distribution::AllZeros, 0);
}
if (IsAuxFp8 && options.save_aux && options.save_amax) {
abs_max_aux.resize(cutlass::make_Coord(1));
+ initialize_tensor(abs_max_aux.host_view(), cutlass::Distribution::AllZeros, 0);
abs_max_aux.sync_device();
reference_abs_max_aux.resize(cutlass::make_Coord(1));
+ initialize_tensor(reference_abs_max_aux.host_view(), cutlass::Distribution::AllZeros, 0);
}
}
@@ -517,10 +521,9 @@ bool verify(const Options &options) {
// Block scaling tensors shapes based CTA Block (TileShape) and GEMM Problem shape
auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
- auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{})));
- auto blockscale_m = cute::get<0>(blockscale_shape);
- auto blockscale_n = cute::get<1>(blockscale_shape);
- auto blockscale_k = cute::get<2>(blockscale_shape);
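+ // One scale factor per CTA tile along each mode; round up so partial tiles are covered.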
+ auto blockscale_m = ceil_div(options.m, get<0>(TileShape{}));
+ auto blockscale_n = ceil_div(options.n, get<1>(TileShape{}));
+ auto blockscale_k = ceil_div(options.k, get<2>(TileShape{}));
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
@@ -608,29 +611,40 @@ bool verify(const Options &options) {
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
+ bool passed = true;
tensor_D.sync_host();
- bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
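+ // Compare against the reference with a relative tolerance and report error statistics instead of requiring bit-exact equality.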
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_D.host_view(), tensor_ref_D.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ double mse = cutlass::reference::host::TensorMSE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double mre = cutlass::reference::host::TensorMRE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double max_error = cutlass::reference::host::TensorGreatestError(tensor_D.host_view(), tensor_ref_D.host_view());
+ std::cout << " Result MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
- if (false) {
- std::cout << "tensor_ref_D.host_view() {" << std::endl
- << tensor_ref_D.host_view() << std::endl
- << "}" << std::endl;
- std::cout << "tensor_D.host_view() {" << std::endl
- << tensor_D.host_view() << std::endl
- << "}" << std::endl;
- }
+#if 0
+ std::cout << "tensor_ref_D.host_view() {" << std::endl
+ << tensor_ref_D.host_view() << std::endl
+ << "}" << std::endl;
+ std::cout << "tensor_D.host_view() {" << std::endl
+ << tensor_D.host_view() << std::endl
+ << "}" << std::endl;
+#endif
if (IsDFp8 && options.save_amax) {
abs_max_D.sync_host();
- passed &= abs_max_D.at(cutlass::make_Coord(0)) == reference_abs_max_D.at(cutlass::make_Coord(0));
+ std::cout << " Abs max D: " << abs_max_D.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_D.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_D.at(cutlass::make_Coord(0)), reference_abs_max_D.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
if (options.save_aux) {
tensor_aux.sync_host();
- passed &= cutlass::reference::host::TensorEquals(tensor_ref_aux.host_view(), tensor_aux.host_view());
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_aux.host_view(), tensor_ref_aux.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ mse = cutlass::reference::host::TensorMSE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ mre = cutlass::reference::host::TensorMRE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ max_error = cutlass::reference::host::TensorGreatestError(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ std::cout << " Aux MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
if (IsAuxFp8 && options.save_amax) {
abs_max_aux.sync_host();
- passed &= abs_max_aux.at(cutlass::make_Coord(0)) == reference_abs_max_aux.at(cutlass::make_Coord(0));
+ std::cout << " Abs max aux: " << abs_max_aux.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_aux.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_aux.at(cutlass::make_Coord(0)), reference_abs_max_aux.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
}
@@ -671,10 +685,9 @@ int run(Options &options)
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
}
-
- // if (!result.passed) {
- // exit(-1);
- // }
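+ // When verification is skipped, report the run as passed so profiling-only runs still return success.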
+ else {
+ result.passed = true;
+ }
// Run profiling loop
if (options.iterations > 0)
@@ -707,7 +720,7 @@ int run(Options &options)
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
- return 0;
+ return result.passed;
}
#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
@@ -753,7 +766,9 @@ int main(int argc, char const **args) {
//
#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
- run(options);
+ bool passed = run(options);
+ if (!passed)
+ return -1;
#endif
return 0;
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
index 03945764..b7cdb00a 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
@@ -100,7 +100,7 @@ using LayoutB = cutlass::layout::ColumnMajor; // L
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
-using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands
+using ElementC = float; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
@@ -303,93 +303,93 @@ struct Result
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
- template <typename Element, typename Layout>
- bool initialize_tensor(
- cutlass::TensorView<Element, Layout> view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template <typename Element, typename Layout>
+bool initialize_tensor(
+ cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
- int bits_input = cutlass::sizeof_bits<Element>::value;
- int bits_output = cutlass::sizeof_bits<Element>::value;
+ double scope_max, scope_min;
+ int bits_input = cutlass::sizeof_bits<Element>::value;
+ int bits_output = cutlass::sizeof_bits<Element>::value;
- if (bits_input == 1) {
- scope_max = 2;
- scope_min = 0;
- } else if (bits_input <= 8) {
- scope_max = 2;
- scope_min = -2;
- } else if (bits_output == 16) {
- scope_max = 5;
- scope_min = -5;
- } else {
- scope_max = 8;
- scope_min = -8;
- }
-
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
+ if (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ } else if (bits_input <= 8) {
+ scope_max = 2;
+ scope_min = -2;
+ } else if (bits_output == 16) {
+ scope_max = 5;
+ scope_min = -5;
+ } else {
+ scope_max = 8;
+ scope_min = -8;
}
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, bits_input);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Helper to initialize a block of device data (scale_tensors)
- template
- bool initialize_scale_tensor(
- cutlass::TensorView view,
- cutlass::Distribution::Kind dist_kind,
- uint64_t seed) {
+template
+bool initialize_scale_tensor(
+ cutlass::TensorView view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
- if (dist_kind == cutlass::Distribution::Uniform) {
+ if (dist_kind == cutlass::Distribution::Uniform) {
- double scope_max, scope_min;
+ double scope_max, scope_min;
- scope_min = -1;
- scope_max = 1;
+ scope_min = -1;
+ scope_max = 1;
- cutlass::reference::host::TensorFillRandomUniform(
- view, seed, scope_max, scope_min, 0);
- }
- else if (dist_kind == cutlass::Distribution::AllZeros) {
- cutlass::reference::host::TensorFill(view);
- }
- else if (dist_kind == cutlass::Distribution::Identity) {
-
- cutlass::reference::host::TensorFillIdentity(view);
- }
- else if (dist_kind == cutlass::Distribution::Gaussian) {
-
- cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
- }
- else if (dist_kind == cutlass::Distribution::Sequential) {
- cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
- }
- else {
- throw std::runtime_error("Not implementated.");
- }
-
- return true;
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min);
}
+ else if (dist_kind == cutlass::Distribution::AllZeros) {
+ cutlass::reference::host::TensorFill(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+ }
+ else {
+ throw std::runtime_error("Not implemented.");
+ }
+
+ return true;
+}
/// Initialize operands to be used in the GEMM and reference GEMM
template
@@ -403,11 +403,9 @@ void initialize(const Options &options) {
assert(options.n % ScaleGranularityN == 0);
// Find Group Scaling tensor shapes based on `ScaleGranularityM`, problem shape, and TileShape
- auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
- auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{})));
- auto groupscale_m = cute::get<0>(gemm_problem_shape) / ScaleGranularityM;
- auto groupscale_n = cute::get<1>(gemm_problem_shape) / ScaleGranularityN;
- auto blockscale_k = cute::get<2>(blockscale_shape);
+ auto groupscale_m = ceil_div(options.m, ScaleGranularityM);
+ auto groupscale_n = ceil_div(options.n, ScaleGranularityN);
+ auto blockscale_k = ceil_div(options.k, cute::get<2>(TileShape{}));
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
@@ -582,13 +580,11 @@ bool verify(const Options &options, const int ScaleMsPerTile
const int ScaleGranularityN = get<1>(TileShape_{}) / ScaleNsPerTile;
// Group scaling tensors shapes based `ScaleGranularityM`, CTA Block (TileShape) and GEMM Problem shape
- auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
- auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape_{})));
- auto blockscale_m = cute::get<0>(blockscale_shape);
- auto blockscale_n = cute::get<1>(blockscale_shape);
- auto blockscale_k = cute::get<2>(blockscale_shape);
- auto groupscale_m = get<0>(gemm_problem_shape) / ScaleGranularityM;
- auto groupscale_n = get<1>(gemm_problem_shape) / ScaleGranularityN;
+ auto blockscale_m = ceil_div(options.m, get<0>(TileShape_{}));
+ auto blockscale_n = ceil_div(options.n, get<1>(TileShape_{}));
+ auto blockscale_k = ceil_div(options.k, get<2>(TileShape_{}));
+ auto groupscale_m = ceil_div(options.m, ScaleGranularityM);
+ auto groupscale_n = ceil_div(options.n, ScaleGranularityN);
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
@@ -676,8 +672,13 @@ bool verify(const Options &options, const int ScaleMsPerTile
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
+ bool passed = true;
tensor_D.sync_host();
- bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_D.host_view(), tensor_ref_D.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ double mse = cutlass::reference::host::TensorMSE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double mre = cutlass::reference::host::TensorMRE(tensor_D.host_view(), tensor_ref_D.host_view());
+ double max_error = cutlass::reference::host::TensorGreatestError(tensor_D.host_view(), tensor_ref_D.host_view());
+ std::cout << " Result MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
#if 0
std::cout << "tensor_ref_D.host_view() {" << std::endl
@@ -690,15 +691,21 @@ bool verify(const Options &options, const int ScaleMsPerTile
if (IsDFp8 && options.save_amax) {
abs_max_D.sync_host();
- passed &= abs_max_D.at(cutlass::make_Coord(0)) == reference_abs_max_D.at(cutlass::make_Coord(0));
+ std::cout << " Abs max D: " << abs_max_D.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_D.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_D.at(cutlass::make_Coord(0)), reference_abs_max_D.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
if (options.save_aux) {
tensor_aux.sync_host();
- passed &= cutlass::reference::host::TensorEquals(tensor_ref_aux.host_view(), tensor_aux.host_view());
+ passed &= cutlass::reference::host::TensorRelativelyEquals(tensor_aux.host_view(), tensor_ref_aux.host_view(), ElementAux(options.epsilon), ElementAux(options.non_zero_floor));
+ mse = cutlass::reference::host::TensorMSE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ mre = cutlass::reference::host::TensorMRE(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ max_error = cutlass::reference::host::TensorGreatestError(tensor_aux.host_view(), tensor_ref_aux.host_view());
+ std::cout << " Aux MSE: " << mse << ", MRE: " << mre << ", greatest error: " << max_error << std::endl;
if (IsAuxFp8 && options.save_amax) {
abs_max_aux.sync_host();
- passed &= abs_max_aux.at(cutlass::make_Coord(0)) == reference_abs_max_aux.at(cutlass::make_Coord(0));
+ std::cout << " Abs max aux: " << abs_max_aux.at(cutlass::make_Coord(0)) << ", reference: " << reference_abs_max_aux.at(cutlass::make_Coord(0)) << std::endl;
+ passed &= cutlass::relatively_equal(abs_max_aux.at(cutlass::make_Coord(0)), reference_abs_max_aux.at(cutlass::make_Coord(0)), ElementScalar(options.epsilon), ElementScalar(options.non_zero_floor));
}
}
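The verification changes in this file move from exact tensor equality to a relative comparison driven by the new --epsilon and --non-zero-floor options. The sketch below illustrates the general shape of such a check on single elements; it is a minimal illustration of the idea, not the cutlass::relatively_equal implementation, and it assumes values below the floor are compared with an absolute tolerance.

```cpp
#include <cmath>
#include <cstdio>

// Illustrative relative-equality check: values whose magnitudes stay below
// `non_zero_floor` are compared absolutely against `epsilon`; larger values
// are compared relative to the larger magnitude. This mirrors the intent of
// the epsilon/non-zero-floor options, not the exact CUTLASS formula.
bool roughly_equal(float a, float b, float epsilon, float non_zero_floor) {
  float diff = std::fabs(a - b);
  float magnitude = std::fmax(std::fabs(a), std::fabs(b));
  if (magnitude < non_zero_floor) {
    return diff <= epsilon;            // near-zero values: absolute tolerance
  }
  return diff <= epsilon * magnitude;  // otherwise: relative tolerance
}

int main() {
  printf("%d\n", roughly_equal(100.0f, 101.5f, 0.02f, 1.0f));  // 1: within 2%
  printf("%d\n", roughly_equal(0.001f, 0.004f, 0.02f, 1.0f));  // 1: below floor
  printf("%d\n", roughly_equal(100.0f, 110.0f, 0.02f, 1.0f));  // 0: 10% off
  return 0;
}
```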
@@ -716,29 +723,29 @@ int run(Options &options)
const int ScaleNsPerTile = GroupScaleConfig::ScaleNsPerTile;
bool skip = false;
-
- if (options.m % ScaleGranularityM != 0) {
- std::cout << "Skippig (m size: " << options.m << " less then ScaleGranularityM: " << ScaleGranularityM << "):" << std::endl;
- skip = true;
- }
-
- if (options.n % ScaleGranularityN != 0) {
- std::cout << "Skippig (n size: " << options.m << " less then ScaleGranularityN: " << ScaleGranularityM << "):" << std::endl;
- skip = true;
- }
-
- if (options.k % size<2>(TileShape{}) != 0) {
- std::cout << "Skippig (k size: " << options.k << " less then TileShape[2]: " << size<2>(TileShape{}) << "):" << std::endl;
- skip = true;
- }
-
- if (!skip) std::cout << "Running: " << std::endl;
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Tile shape (M, N, K): " << size<0>(TileShape{}) << ", " << size<1>(TileShape{}) << ", " << size<2>(TileShape{}) << std::endl;
std::cout << " ScaleGranularityM: " << ScaleGranularityM << " (ScaleMsPerTile: " << ScaleMsPerTile << ")" << std::endl;
std::cout << " ScaleGranularityN: " << ScaleGranularityN << " (ScaleNsPerTile: " << ScaleNsPerTile << ")" << std::endl;
- if (skip) return -1;
+
+ if (options.m < ScaleGranularityM) {
+ std::cout << " Skippig (m size: " << options.m << " less than ScaleGranularityM: " << ScaleGranularityM << "):" << std::endl;
+ skip = true;
+ }
+
+ if (options.n < ScaleGranularityN) {
+ std::cout << " Skippig (n size: " << options.n << " less than ScaleGranularityN: " << ScaleGranularityN << "):" << std::endl;
+ skip = true;
+ }
+
+ if (options.k < size<2>(TileShape{})) {
+ std::cout << " Skippig (k size: " << options.k << " less than TileShape[2]: " << size<2>(TileShape{}) << "):" << std::endl;
+ skip = true;
+ }
+
+ if (!skip) std::cout << " Running... " << std::endl;
+ else return -1;
initialize(options);
@@ -770,17 +777,17 @@ int run(Options &options)
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
}
-
- if (!result.passed) {
- exit(-1);
+ else {
+ result.passed = true;
}
// Run profiling loop
if (options.iterations > 0)
{
GpuTimer timer;
- timer.start();
- for (int iter = 0; iter < options.iterations; ++iter) {
+ for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
+ if (iter == options.warmup)
+ timer.start();
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
CUTLASS_CHECK(gemm.run());
}
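The profiling loop above now folds warmup runs into the timed loop and starts the timer only once `iter` reaches `options.warmup`, so warmup launches no longer count toward the average runtime. A minimal standalone sketch of that pattern, with a placeholder workload in place of the GEMM launch, follows.

```cpp
#include <chrono>
#include <cstdio>

// Placeholder for the kernel launch being profiled.
void run_workload() { /* ... */ }

int main() {
  int warmup = 5, iterations = 100;
  std::chrono::steady_clock::time_point start;

  for (int iter = 0; iter < warmup + iterations; ++iter) {
    if (iter == warmup) {
      start = std::chrono::steady_clock::now();  // exclude warmup from timing
    }
    run_workload();
  }

  auto elapsed_ms = std::chrono::duration<double, std::milli>(
      std::chrono::steady_clock::now() - start).count();
  printf("Average runtime: %f ms\n", elapsed_ms / iterations);
  return 0;
}
```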
@@ -806,7 +813,7 @@ int run(Options &options)
fflush(stdout);
}
- return 0;
+ return result.passed;
}
#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
@@ -852,27 +859,31 @@ int main(int argc, char const **args) {
//
#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+ bool passed = true;
std::cout << "Basic split-K GEMM kernel" << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
std::cout << std::endl;
std::cout << "StreamK GEMM kernel" << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
- run(options);
+ passed &= run(options);
std::cout << std::endl;
+
+ if (!passed)
+ return -1;
#endif
return 0;
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
index 23f05ada..85aff756 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
@@ -46,6 +46,8 @@ struct Options {
int m = 1024, n = 512, k = 1024, l = 1;
RasterOrderOptions raster;
int swizzle;
+ float epsilon = 0.02f;
+ float non_zero_floor = 1.f;
// Parses the command line
void parse(int argc, char const **args) {
@@ -73,6 +75,8 @@ struct Options {
cmd.get_cmd_line_argument("warmup", warmup);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("verify", verify);
+ cmd.get_cmd_line_argument("epsilon", epsilon);
+ cmd.get_cmd_line_argument("non-zero-floor", non_zero_floor);
char raster_char;
cmd.get_cmd_line_argument("raster", raster_char);
@@ -113,7 +117,10 @@ struct Options {
<< " --save_amax= Save the pre-scaled max absolute value of any fp8 outputs (aux and/or D) (default: true)\n"
<< " --raster= CTA Rasterization direction (N for along N, M for along M, and H for heuristic)\n\n"
<< " --swizzle= CTA Rasterization swizzle\n\n"
- << " --iterations= Number of profiling iterations to perform.\n\n";
+ << " --iterations= Number of profiling iterations to perform.\n\n"
+ << " --verify= Verify the results.\n\n"
+ << " --epsilon= The epsilon value for comparing the results.\n\n"
+ << " --non-zero-floor= The none zero floor for comparing the results.\n\n";
out
<< "\n\nExamples:\n\n"
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h
index 6bb593bd..0bf90a41 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h
@@ -221,9 +221,9 @@ void gett_mainloop(
const int N = cute::size<0>(mainloop_params.B.layout());
const int ScaleGranularityM = M / cute::size<0>(mainloop_params.ScaleA);
const int ScaleGranularityN = N / cute::size<0>(mainloop_params.ScaleB);
- assert(ScaleGranularityM && M % ScaleGranularityM == 0
+ assert(ScaleGranularityM && M % ScaleGranularityM == 0
&& "ScaleGranularityM must divide M");
- assert(ScaleGranularityN && N % ScaleGranularityN == 0
+ assert(ScaleGranularityN && N % ScaleGranularityN == 0
&& "ScaleGranularityN must divide N");
cute::Tensor blockscale_A = domain_offset(
diff --git a/examples/69_hopper_mixed_dtype_grouped_gemm/README.md b/examples/69_hopper_mixed_dtype_grouped_gemm/README.md
index 272d36e5..f4d71ea3 100644
--- a/examples/69_hopper_mixed_dtype_grouped_gemm/README.md
+++ b/examples/69_hopper_mixed_dtype_grouped_gemm/README.md
@@ -12,3 +12,35 @@ Note that in Example 55, the argument `--g` is used to determine the block scale
## Upcoming features
Currently, the Mixed-input Grouped GEMM only supports row-wise scaling. Please contact us if zero-points or block-wise scaling are needed.
+
+## Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
index 3cee6caf..19d6b89d 100644
--- a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
+++ b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
@@ -194,12 +194,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle;
Options():
help(false),
m(8192), n(8192), k(8192),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -217,6 +219,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -231,6 +234,7 @@ struct Options {
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
<< " --beta= Epilogue scalar beta\n\n"
+ << " --swizzle= Cluster rasterization swizzle\n\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out
@@ -331,6 +335,8 @@ typename Gemm::Arguments args_from_options(const Options &options)
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
+
return arguments;
}
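Several examples in this patch gain a --swizzle option that is forwarded to the tile scheduler through `arguments.scheduler.max_swizzle_size`. The sketch below mocks up that plumbing with stand-in structs; the real `Gemm::Arguments` and `cutlass::CommandLine` types come from CUTLASS and are not reproduced here.

```cpp
#include <cstdio>

// Stand-ins for the nested Arguments struct that the examples populate; the
// real type is each example's Gemm::Arguments.
struct SchedulerArgs { int max_swizzle_size = 0; };
struct GemmArguments { SchedulerArgs scheduler; };

// Parse --swizzle=<int> from the command line, defaulting to 0 (no swizzle),
// and forward it to the scheduler arguments as the examples above do.
GemmArguments args_from_options(int argc, char** argv) {
  GemmArguments arguments;
  int swizzle = 0;
  for (int i = 1; i < argc; ++i) {
    if (sscanf(argv[i], "--swizzle=%d", &swizzle) == 1) { break; }
  }
  arguments.scheduler.max_swizzle_size = swizzle;
  return arguments;
}

int main(int argc, char** argv) {
  GemmArguments args = args_from_options(argc, argv);
  printf("max_swizzle_size = %d\n", args.scheduler.max_swizzle_size);
  return 0;
}
```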
diff --git a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
index 69a36310..d476ce00 100644
--- a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
+++ b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
@@ -231,6 +231,7 @@ struct Options {
bool save_amax = true;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1;
+ int swizzle = 0;
// Parses the command line
void parse(int argc, char const **args) {
@@ -256,6 +257,7 @@ struct Options {
cmd.get_cmd_line_argument("save_aux", save_aux, true);
cmd.get_cmd_line_argument("save_amax", save_amax, true);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -271,6 +273,7 @@ struct Options {
<< " --l= Sets the l extent (batch) of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
<< " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --scale_a= Scaling factor for A\n"
<< " --scale_b= Scaling factor for B\n"
<< " --scale_c= Scaling factor for C\n"
@@ -476,6 +479,8 @@ typename Gemm::Arguments args_from_options(const Options &options)
fusion_args.amax_D_ptr = abs_max_D.device_data();
}
+ arguments.scheduler.max_swizzle_size = options.swizzle;
+
return arguments;
}
diff --git a/examples/70_blackwell_gemm/CMakeLists.txt b/examples/70_blackwell_gemm/CMakeLists.txt
index cb401e3a..0ac1687d 100644
--- a/examples/70_blackwell_gemm/CMakeLists.txt
+++ b/examples/70_blackwell_gemm/CMakeLists.txt
@@ -28,14 +28,29 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-if (CUTLASS_NVCC_ARCHS MATCHES 100a)
+set(TEST_SWIZZLE_1 --swizzle=1)
+set(TEST_SWIZZLE_2 --swizzle=2)
+set(TEST_SWIZZLE_5 --swizzle=5)
+set(TEST_SWIZZLE_5_UNEVEN --swizzle=5 --m=4096 --n=16384)
+
+if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
cutlass_example_add_executable(
70_blackwell_fp16_gemm
70_blackwell_fp16_gemm.cu
-)
+ TEST_COMMAND_OPTIONS
+ TEST_SWIZZLE_1
+ TEST_SWIZZLE_2
+ TEST_SWIZZLE_5
+ TEST_SWIZZLE_5_UNEVEN
+)
cutlass_example_add_executable(
70_blackwell_fp8_gemm
70_blackwell_fp8_gemm.cu
+ TEST_COMMAND_OPTIONS
+ TEST_SWIZZLE_1
+ TEST_SWIZZLE_2
+ TEST_SWIZZLE_5
+ TEST_SWIZZLE_5_UNEVEN
)
endif()
diff --git a/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu b/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu
index 427af254..f911262f 100644
--- a/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu
+++ b/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu
@@ -74,12 +74,14 @@ struct Options {
int m, n, k, l;
float alpha, beta;
+ int swizzle;
Options():
help(false),
error(false),
m(2048), n(2048), k(2048), l(1),
- alpha(1.f), beta(0.f)
+ alpha(1.f), beta(0.f),
+ swizzle(0)
{ }
// Parses the command line
@@ -97,6 +99,7 @@ struct Options {
cmd.get_cmd_line_argument("l", l, 1);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -112,7 +115,8 @@ struct Options {
<< " --k= Sets the K extent of the GEMM\n"
<< " --l= Sets the L extent (batch count) of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n";
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n\n";
return out;
}
@@ -352,6 +356,8 @@ struct ExampleRunner {
hw_info
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
+
// See example 48 for details on custom EVT construction
if constexpr (UseCustomEVT) {
arguments.epilogue.thread =
diff --git a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
index f7e12fbf..f729b43d 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
@@ -211,12 +211,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle = 0;
Options():
help(false),
m(1024), n(1024), k(1024),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -234,6 +236,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -247,7 +250,8 @@ struct Options {
<< " --n= Sets the N extent of the GEMM\n"
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n"
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
@@ -333,7 +337,7 @@ bool initialize_block(
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
- using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
@@ -344,8 +348,8 @@ void initialize(const Options &options) {
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
- layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
- layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
block_B.reset(cutlass::make_Coord(size(layout_B)));
@@ -387,6 +391,7 @@ typename Gemm::Arguments args_from_options(const Options &options)
}
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
index 2719cab9..75d3437d 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
@@ -177,7 +177,7 @@ using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
using FusionOp = typename Gemm::EpilogueOutputOp;
constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
-using SfdOutputCfg = cutlass::detail::Sm100BlockScaledOutputConfig;
+using SfdOutputCfg = cutlass::detail::Sm1xxBlockScaledOutputConfig;
using LayoutSFD = typename SfdOutputCfg::LayoutSF;
//
@@ -240,12 +240,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle = 0;
Options():
help(false),
m(1024), n(1024), k(1024),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -263,6 +265,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -276,7 +279,8 @@ struct Options {
<< " --n= Sets the N extent of the GEMM\n"
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n"
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
@@ -362,9 +366,9 @@ bool initialize_block(
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
- using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
// For SFD tensor layout
- using Sm100BlockScaledOutputConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlockScaledOutputConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
@@ -375,8 +379,8 @@ void initialize(const Options &options) {
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
- layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
- layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
layout_SFD = SfdOutputCfg::tile_atom_to_shape_SFD(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
@@ -432,6 +436,7 @@ typename Gemm::Arguments args_from_options(const Options &options)
arguments.epilogue.thread.norm_constant_ptr = block_Normconst.device_data();
}
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
index 2784d050..1d6c1f3c 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
@@ -212,12 +212,14 @@ struct Options {
float alpha, beta;
int iterations;
int m, n, k;
+ int swizzle = 0;
Options():
help(false),
m(1024), n(1024), k(1024),
alpha(1.f), beta(0.f),
- iterations(10)
+ iterations(10),
+ swizzle(0)
{ }
// Parses the command line
@@ -235,6 +237,7 @@ struct Options {
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
@@ -248,7 +251,8 @@ struct Options {
<< " --n= Sets the N extent of the GEMM\n"
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
- << " --beta= Epilogue scalar beta\n\n"
+ << " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --iterations= Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
@@ -334,7 +338,7 @@ bool initialize_block(
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
- using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
@@ -345,8 +349,8 @@ void initialize(const Options &options) {
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
- layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
- layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
block_B.reset(cutlass::make_Coord(size(layout_B)));
@@ -388,6 +392,7 @@ typename Gemm::Arguments args_from_options(const Options &options)
}
};
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
index 19c4efd1..67b82a6e 100644
--- a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
+++ b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
@@ -214,7 +214,8 @@ struct Options {
int iterations;
int m, n, k;
int preferred_cluster_m, preferred_cluster_n, fallback_cluster_m, fallback_cluster_n;
-
+ int swizzle = 0;
+
Options():
help(false),
m(4096), n(4096), k(4096),
@@ -223,7 +224,8 @@ struct Options {
preferred_cluster_m(4),
preferred_cluster_n(4),
fallback_cluster_m(2),
- fallback_cluster_n(1)
+ fallback_cluster_n(1),
+ swizzle(0)
{ }
// Parses the command line
@@ -245,6 +247,7 @@ struct Options {
cmd.get_cmd_line_argument("preferred_cluster_n", preferred_cluster_n, 4);
cmd.get_cmd_line_argument("fallback_cluster_m", fallback_cluster_m, 2);
cmd.get_cmd_line_argument("fallback_cluster_n", fallback_cluster_n, 1);
+ cmd.get_cmd_line_argument("swizzle", swizzle);
if (!validate_cluster_shape()){
std::cout << "--Invalid cluster shapes" << std::endl;
@@ -265,6 +268,7 @@ struct Options {
<< " --k= Sets the K extent of the GEMM\n"
<< " --alpha= Epilogue scalar alpha\n"
<< " --beta= Epilogue scalar beta\n"
+ << " --swizzle= Cluster rasterization swizzle\n"
<< " --preferred_cluster_m= Sets the M extent of preferred cluster shape\n"
<< " --preferred_cluster_n= Sets the N extent of preferred cluster shape\n"
<< " --fallback_cluster_m= Sets the M extent of fallback cluster shape\n"
@@ -384,7 +388,8 @@ typename Gemm::Arguments args_from_options(const Options &options) {
arguments.hw_info.cluster_shape = dim3(options.preferred_cluster_m, options.preferred_cluster_n,1);
arguments.hw_info.cluster_shape_fallback = dim3(options.fallback_cluster_m, options.fallback_cluster_n,1);
-
+
+ arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
index 1d8db6e2..ad563a4b 100644
--- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
+++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
@@ -242,6 +242,7 @@ using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTil
struct Options {
bool help = false;
+ bool use_pdl = false;
float alpha = FLT_MAX;
float beta = FLT_MAX;
@@ -264,6 +265,9 @@ struct Options {
help = true;
return;
}
+ if (cmd.check_cmd_line_flag("use_pdl")) {
+ use_pdl = true;
+ }
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
@@ -387,7 +391,8 @@ struct Options {
<< " --raster= CTA Rasterization direction (N for along N, M for along M)\n\n"
<< " --iterations= Number of profiling iterations to perform\n\n"
<< " --benchmark= Executes a benchmark problem size\n"
- << " --max_sm_count= Run kernels using only these number of SMs\n";
+ << " --max_sm_count= Run kernels using only these number of SMs\n"
+ << " --use_pdl Launch kernel with PDL (Programmatic Dependent Launch) enabled\n";
out
<< "\n\nExamples:\n\n"
@@ -711,7 +716,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
// Check if output from CUTLASS kernel and reference kernel are equal or not
Result result;
@@ -730,7 +735,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
}
timer.stop();
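The grouped GEMM examples add a --use_pdl flag and pass it as the third argument of gemm.run(), enabling Programmatic Dependent Launch so a dependent kernel may begin its preamble before the prior kernel fully completes. The mock below only illustrates how the flag reaches the launch call; the real adapter and its run() parameters are the ones shown in the hunks above.

```cpp
#include <cstdio>
#include <cstring>

// Mock GEMM adapter illustrating the run() call pattern used above: the third
// parameter toggles Programmatic Dependent Launch (PDL). This is a stand-in,
// not the CUTLASS GemmUniversalAdapter.
struct MockGemm {
  int run(void* stream = nullptr, void* cuda_adapter = nullptr,
          bool launch_with_pdl = false) {
    printf("launching %s PDL\n", launch_with_pdl ? "with" : "without");
    return 0;
  }
};

int main(int argc, char** argv) {
  bool use_pdl = false;
  for (int i = 1; i < argc; ++i) {
    if (std::strcmp(argv[i], "--use_pdl") == 0) { use_pdl = true; }
  }
  MockGemm gemm;
  // Mirrors: gemm.run(/* stream */ nullptr, /* cuda_adapter */ nullptr,
  //                   /* launch_with_pdl */ options.use_pdl);
  gemm.run(nullptr, nullptr, use_pdl);
  return 0;
}
```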
diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
index ee697135..d5814c0a 100644
--- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
+++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
@@ -219,14 +219,14 @@ using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
-using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
-using Sm100BlockScaledOutputConfig = cutlass::detail::Sm100BlockScaledOutputConfig<
+using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+using Sm1xxBlockScaledOutputConfig = cutlass::detail::Sm1xxBlockScaledOutputConfig<
OutputSFVectorSize,
cute::is_same_v ? cute::UMMA::Major::K : cute::UMMA::Major::MN
>;
-using OutputSFAtom = typename Sm100BlockScaledOutputConfig::SfAtom;
-using LayoutSFD = typename Sm100BlockScaledOutputConfig::LayoutSF;
+using OutputSFAtom = typename Sm1xxBlockScaledOutputConfig::SfAtom;
+using LayoutSFD = typename Sm1xxBlockScaledOutputConfig::LayoutSF;
// Host-side allocations
std::vector stride_A_host;
@@ -305,6 +305,7 @@ struct Options {
bool help = false;
bool verification = true;
+ bool use_pdl = false;
float alpha = FLT_MAX;
float beta = FLT_MAX;
@@ -328,9 +329,12 @@ struct Options {
help = true;
return;
}
- if (cmd.check_cmd_line_flag("no-verif")) {
+ if (cmd.check_cmd_line_flag("no_verif")) {
verification = false;
}
+ if (cmd.check_cmd_line_flag("use_pdl")) {
+ use_pdl = true;
+ }
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
@@ -457,7 +461,8 @@ struct Options {
<< " --iterations= Number of profiling iterations to perform\n\n"
<< " --benchmark= Executes a benchmark problem size\n"
<< " --max_sm_count= Run kernels using only these number of SMs\n"
- << " --no-verif Do not run (host-side) verification kernels\n";
+ << " --no_verif Do not run (host-side) verification kernels\n"
+ << " --use_pdl Launch kernel with PDL (Programmatic Dependent Launch) enabled\n";
out
<< "\n\nExamples:\n\n"
@@ -554,9 +559,9 @@ void allocate(const Options &options) {
auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
auto layout_D = make_layout(make_shape(M, N, 1), stride_D);
- auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
- auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
- auto layout_SFD = Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
+ auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
+ auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
+ auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
stride_A_host.push_back(stride_A);
stride_B_host.push_back(stride_B);
@@ -775,9 +780,9 @@ bool verify(const Options &options) {
auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
auto layout_D = make_layout(make_shape(M, N, 1), stride_D);
- auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
- auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
- auto layout_SFD = Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
+ auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1));
+ auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1));
+ auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(cute::make_shape(M, N, K, 1));
// Create the arguments for host reference implementation
Tensor tensor_A = make_tensor(make_iterator(block_A.at(i).host_data()), layout_A);
@@ -845,7 +850,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
cudaDeviceSynchronize();
@@ -870,7 +875,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
- CUTLASS_CHECK(gemm.run());
+ CUTLASS_CHECK(gemm.run(/* stream = */ nullptr, /* cuda_adapter = */ nullptr, /* launch_with_pdl = */ options.use_pdl));
}
timer.stop();
diff --git a/examples/77_blackwell_fmha/README.md b/examples/77_blackwell_fmha/README.md
index 8766f081..2f4c9c76 100644
--- a/examples/77_blackwell_fmha/README.md
+++ b/examples/77_blackwell_fmha/README.md
@@ -21,3 +21,35 @@ To modify the code for fusions, `collective/fmha_fusion.hpp` provides the easies
The `apply_mask` function is called with the accumulator of the first GEMM and the logical positions of those elements.
It is well-suited for applying masks or activations.
More complex fusions that require memory loads would require modifying the mainloop collective to orchestrate the load via TMA.
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
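The 77_blackwell_fmha README above identifies `apply_mask` in `collective/fmha_fusion.hpp` as the hook that receives the first GEMM's accumulator together with the logical positions of its elements. The snippet below is a purely hypothetical, self-contained illustration of that idea, applying a causal mask by position; it does not reproduce the actual fusion interface or its fragment types.

```cpp
#include <cstdio>
#include <limits>

// Hypothetical stand-in for a fusion hook: given an accumulator value and the
// logical (row, col) position it corresponds to, apply a causal mask by
// forcing future positions to -inf ahead of the softmax.
float apply_mask(float acc, int row, int col) {
  return (col <= row) ? acc : -std::numeric_limits<float>::infinity();
}

int main() {
  // Apply the mask to a small 4x4 tile of attention scores.
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      printf("%6.1f ", apply_mask(1.0f, r, c));
    }
    printf("\n");
  }
  return 0;
}
```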
diff --git a/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
new file mode 100644
index 00000000..058c4b2b
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
@@ -0,0 +1,546 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM120 architecture.
+
+ This example demonstrates a simple way to instantiate and run a blockscaled NVFP4 GEMM on the NVIDIA Blackwell SM120 architecture.
+ This kernel is optimized for the GeForce RTX 50 series GPUs.
+
+ The Blackwell SM120 CUTLASS kernel uses the new Block Scaled Tensor Core MMA Instructions (mma.sync.aligned.block_scale).
+ NVFP4 MMA has 2x throughput compared to MXFP8 MMA and 4x throughput compared to Ada Tensor Core FP8 MMA.
+ (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+
+ This kernel leverages:
+ 1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
+ 2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+ 3. Block Scaled Tensor Core MMA Instructions
+ 4. Epilogue Optimization
+
+ Note that GeForce RTX 50 series GPUs do not support:
+ 1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
+ 2. Dynamic datatypes.
+
+ Usage:
+
+ $ ./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+
+#include
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = cutlass::nv_float4_t; // Element type for A matrix operand
+using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 32; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = cutlass::nv_float4_t; // Element type for B matrix operand
+using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 32; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
+
+// C/D matrix configuration
+using ElementD = cutlass::bfloat16_t; // Element type for D matrix operand
+using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
+using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
+using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
+constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+// Kernel functional config
+using ElementAccumulator = float; // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
+
+// Kernel Perf config
+using ThreadBlockShape = Shape<_128,_128,_128>; // Threadblock's tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ThreadBlockShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto,
+ ElementAccumulator, ElementAccumulator,
+ ElementC, LayoutCTag, AlignmentC,
+ ElementD, LayoutDTag, AlignmentD,
+ cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
+ >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutATag, AlignmentA,
+ ElementB, LayoutBTag, AlignmentB,
+ ElementAccumulator,
+ ThreadBlockShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto defaults to cooperative kernel schedule
+ >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue,
+ void>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter;
+
+// Reference device GEMM implementation type
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
+using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
+using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
+using StrideD = typename Gemm::GemmKernel::StrideD;
+using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+LayoutA layout_A;
+LayoutSFA layout_SFA;
+StrideB stride_B;
+LayoutB layout_B;
+LayoutSFB layout_SFB;
+StrideC stride_C;
+LayoutC layout_C;
+StrideD stride_D;
+LayoutD layout_D;
+uint64_t seed;
+
+// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
+// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
+cutlass::HostTensor block_A;
+cutlass::HostTensor block_SFA;
+cutlass::HostTensor block_B;
+cutlass::HostTensor block_SFB;
+cutlass::HostTensor block_C;
+// Output Tensor
+cutlass::HostTensor block_D;
+// Reference Output Tensor
+cutlass::HostTensor block_reference_D;
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+template
+auto make_iterator(T* ptr) {
+ using namespace cute;
+ if constexpr (cute::is_subbyte_v) {
+ return subbyte_iterator(ptr);
+ }
+ else {
+ return ptr;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(1024), n(1024), k(1024),
+ alpha(1.f), beta(0.f),
+ iterations(10)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "79a_blackwell_geforce_nvfp4_bf16_gemm\n\n"
+ << " Blackwell NVFP4 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Number of profiling iterations to perform.\n\n";
+
+ out << "\n\nExamples:\n\n"
+ << "$ " << "./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template
+bool initialize_block(
+ cutlass::TensorView view,
+ uint64_t seed) {
+
+ double scope_max, scope_min;
+ constexpr int bits_input = cutlass::sizeof_bits::value;
+
+ if constexpr (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ }
+ else if constexpr (bits_input <= 6) {
+ scope_max = 2;
+ scope_min = -2;
+ }
+ else if constexpr (bits_input <= 8) {
+ if constexpr (cute::is_same_v) {
+ scope_max = 4;
+ scope_min = 1;
+ }
+ else {
+ scope_max = 1;
+ scope_min = -1;
+ }
+ }
+ else{
+ scope_max = 4;
+ scope_min = -4;
+ }
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+ using namespace cute;
+ // For SFA and SFB tensors layouts
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
+
+ layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
+ layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
+ layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
+ layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+
+ block_A.reset(cutlass::make_Coord(size(layout_A)));
+ block_B.reset(cutlass::make_Coord(size(layout_B)));
+ block_C.reset(cutlass::make_Coord(size(layout_C)));
+ block_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
+ block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
+
+ initialize_block(block_A.host_view(), seed + 2021);
+ initialize_block(block_B.host_view(), seed + 2022);
+ initialize_block(block_C.host_view(), seed + 2023);
+ initialize_block(block_SFA.host_view(), seed + 2024);
+ initialize_block(block_SFB.host_view(), seed + 2025);
+
+ block_A.sync_device();
+ block_B.sync_device();
+ block_C.sync_device();
+ block_SFA.sync_device();
+ block_SFB.sync_device();
+}
+
+// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments {
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k, 1},
+ { // Mainloop arguments
+ block_A.device_data(), stride_A,
+ block_B.device_data(), stride_B,
+ block_SFA.device_data(), layout_SFA,
+ block_SFB.device_data(), layout_SFB
+ },
+ { // Epilogue arguments
+ {options.alpha, options.beta},
+ block_C.device_data(), stride_C,
+ block_D.device_data(), stride_D
+ }
+ };
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ using namespace cute;
+ // Create the arguments for host reference implementation
+ Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
+ Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
+ Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
+ Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
+
+ cutlass::reference::host::GettBlockScalingMainloopParams<
+ ElementAccumulator, // ElementAccumulator
+ decltype(tensor_A), // TensorA
+ decltype(tensor_SFA), // TensorSfA
+ decltype(tensor_B), // TensorB
+ decltype(tensor_SFB) // TensorSfB
+ > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
+
+ auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
+ auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
+
+ cutlass::reference::host::GettBlockScalingEpilogueParams<
+ ElementAccumulator, // ElementScalar
+ ElementAccumulator, // ElementAccumulator
+ ElementAccumulator, // ElementCompute
+ decltype(tensor_C), // TensorC
+ decltype(tensor_D) // TensorD
+ > epilogue_params{options.alpha, options.beta, tensor_C, tensor_D};
+
+ cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
+
+ // Comparison
+ block_D.sync_host();
+ bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
+ passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
+ passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
+
+ return passed;
+}
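+// Note: verification uses an exact tensor-equality check, which is reasonable here because the host
+// reference is expected to perform the same blockscaled arithmetic as the device kernel; the two
+// norm checks guard against a trivially all-zero output being counted as a pass.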
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ cudaDeviceSynchronize();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with the CUDA 12.8 Toolkit or newer to run this example,
+ // and the device must have compute capability 120 (Blackwell SM120).
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
+ std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+
+ if (!(props.major == 12 && props.minor == 0)) {
+ std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+ run<Gemm>(options);
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
new file mode 100644
index 00000000..e3ebba4a
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
@@ -0,0 +1,593 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM120 architecture.
+
+ This example demonstrates a simple way to instantiate and run a blockscaled NVFP4 GEMM on the NVIDIA Blackwell SM120 architecture.
+ The kernel outputs quantized fp4 values with scale factors that will be the input of another GEMM.
+ This kernel is optimized for the GeForce RTX 50 series GPUs.
+
+ Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
+
+ 1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
+ 2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+ 3. Block Scaled Tensor Core MMA Instructions
+ 4. Epilogue Optimization
+
+ Note that GeForce RTX 50 series GPUs do not support:
+ 1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
+ 2. Dynamic datatypes.
+
+ Usage:
+
+ $ ./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+
+#include
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>; // Element type for A matrix operand
+using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 32; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>; // Element type for B matrix operand
+using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 32; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
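+// With 4-bit NVFP4 operands, an alignment of 32 elements corresponds to 32 x 4 bits = 16 bytes,
+// i.e. the 16-byte access granularity mentioned above.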
+
+// C/D matrix configuration
+using ElementD = cutlass::float_e2m1_t; // Element type for D matrix operand
+using ElementSFD = cutlass::float_ue8m0_t; // Element type for SFD matrix operand
+using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
+using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
+using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
+using LayoutSFDTag = LayoutDTag; // Layout type for SFD should be same as D matrix operand
+
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
+constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+// Kernel functional config
+using ElementAccumulator = float; // Element type for internal accumulation
+using ElementCompute = float; // Element type for epilogue computation
+using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
+
+// Kernel Perf config
+using ThreadBlockShape = Shape<_128,_128,_128>; // Threadblock's tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster
+
+constexpr int InputSFVectorSize = 16;
+constexpr int OutputSFVectorSize = InputSFVectorSize;
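+// One output scale factor is generated per OutputSFVectorSize (16) elements of D, matching the
+// 16-element scale-factor block size of the NVFP4 inputs.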
+
+// D = alpha * acc + beta * C
+// With BlockScaleFactor generation.
+using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
+ OutputSFVectorSize,
+ ElementD,
+ ElementCompute,
+ ElementSFD, LayoutSFDTag,
+ ElementC>;
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ThreadBlockShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto,
+ ElementAccumulator, ElementAccumulator,
+ ElementC, LayoutCTag, AlignmentC,
+ ElementD, LayoutDTag, AlignmentD,
+ cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
+ FusionOperation
+ >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutATag, AlignmentA,
+ ElementB, LayoutBTag, AlignmentB,
+ ElementAccumulator,
+ ThreadBlockShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ cutlass::gemm::KernelTmaWarpSpecializedPingpong // Ping-pong kernel schedule policy.
+ >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int,int,int,int>, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue,
+ void>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+// Reference device GEMM implementation type
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
+using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
+using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
+using StrideD = typename Gemm::GemmKernel::StrideD;
+using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
+
+using FusionOp = typename Gemm::EpilogueOutputOp;
+constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
+using SfdOutputCfg = cutlass::detail::Sm1xxBlockScaledOutputConfig<OutputSFVectorSize>;
+using LayoutSFD = typename SfdOutputCfg::LayoutSF;
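+// Like SFA and SFB, the generated SFD tensor uses the interleaved blockscaled atom layout provided
+// by the output configuration rather than a plain packed stride.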
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+LayoutA layout_A;
+LayoutSFA layout_SFA;
+StrideB stride_B;
+LayoutB layout_B;
+LayoutSFB layout_SFB;
+StrideC stride_C;
+LayoutC layout_C;
+StrideD stride_D;
+LayoutD layout_D;
+LayoutSFD layout_SFD;
+
+uint64_t seed;
+
+// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
+// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
+cutlass::HostTensor block_A;
+cutlass::HostTensor block_SFA;
+cutlass::HostTensor block_B;
+cutlass::HostTensor block_SFB;
+cutlass::HostTensor block_C;
+// Output Tensor
+cutlass::HostTensor block_D;
+cutlass::HostTensor block_SFD;
+
+// Reference Output Tensor
+cutlass::HostTensor block_reference_D;
+cutlass::HostTensor block_reference_SFD;
+// Matrix-wide normalization constant
+cutlass::HostTensor block_Normconst;
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+template <typename T>
+auto make_iterator(T* ptr) {
+ using namespace cute;
+ if constexpr (cute::is_subbyte_v<T>) {
+ return subbyte_iterator<T>(ptr);
+ }
+ else {
+ return ptr;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(1024), n(1024), k(1024),
+ alpha(1.f), beta(0.f),
+ iterations(10)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "79b_blackwell_geforce_nvfp4_nvfp4_gemm\n\n"
+ << " Blackwell NVFP4 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Number of profiling iterations to perform.\n\n";
+
+ out << "\n\nExamples:\n\n"
+ << "$ " << "./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
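+// For the default 1024x1024x1024 problem, Options::gflops() computes 2 * 1024^3 ~= 2.15 GFLOP,
+// so an average runtime of 1 ms corresponds to roughly 2147 GFLOP/s in the report below.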
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template <typename Element, typename Layout>
+bool initialize_block(
+ cutlass::TensorView<Element, Layout> view,
+ uint64_t seed) {
+
+ double scope_max, scope_min;
+ constexpr int bits_input = cutlass::sizeof_bits<Element>::value;
+
+ if constexpr (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ }
+ else if constexpr (bits_input <= 6) {
+ scope_max = 2;
+ scope_min = -2;
+ }
+ else if constexpr (bits_input <= 8) {
+ if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
+ scope_max = 4;
+ scope_min = 1;
+ }
+ else {
+ scope_max = 1;
+ scope_min = -1;
+ }
+ }
+ else{
+ scope_max = 4;
+ scope_min = -4;
+ }
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+ using namespace cute;
+ // For SFA and SFB tensors layouts
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+ // For SFD tensor layout
+ using Sm1xxBlockScaledOutputConfig= typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
+
+ layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
+ layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
+ layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
+ layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFD = SfdOutputCfg::tile_atom_to_shape_SFD(cute::make_shape(options.m, options.n, options.k, 1));
+
+ block_A.reset(cutlass::make_Coord(size(layout_A)));
+ block_B.reset(cutlass::make_Coord(size(layout_B)));
+ block_C.reset(cutlass::make_Coord(size(layout_C)));
+ block_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_SFD.reset(cutlass::make_Coord(size(filter_zeros(layout_SFD))));
+ block_Normconst.reset(cutlass::make_Coord(1));
+
+ block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
+ block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
+ block_SFD.reset(cutlass::make_Coord(size(filter_zeros(layout_SFD))));
+
+ initialize_block(block_A.host_view(), seed + 2021);
+ initialize_block(block_B.host_view(), seed + 2022);
+ initialize_block(block_C.host_view(), seed + 2023);
+ initialize_block(block_SFA.host_view(), seed + 2024);
+ initialize_block(block_SFB.host_view(), seed + 2025);
+ block_Normconst.at(cutlass::make_Coord(0)) = 2;
+
+ block_A.sync_device();
+ block_B.sync_device();
+ block_C.sync_device();
+ block_SFA.sync_device();
+ block_SFB.sync_device();
+ block_SFD.sync_device();
+ block_Normconst.sync_device();
+}
+
+// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments {
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k, 1},
+ { // Mainloop arguments
+ block_A.device_data(), stride_A,
+ block_B.device_data(), stride_B,
+ block_SFA.device_data(), layout_SFA,
+ block_SFB.device_data(), layout_SFB
+ },
+ { // Epilogue arguments
+ {options.alpha, options.beta},
+ block_C.device_data(), stride_C,
+ block_D.device_data(), stride_D
+ }
+ };
+
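+ // If the epilogue fusion supports block scale factor generation (IsBlockScaleSupported), also wire up
+ // the device pointers for the generated SFD tensor and the matrix-wide normalization constant below.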
+ if constexpr (IsBlockScaleSupported) {
+ arguments.epilogue.thread.block_scale_factor_ptr = block_SFD.device_data();
+ arguments.epilogue.thread.norm_constant_ptr = block_Normconst.device_data();
+ }
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ using namespace cute;
+ // Create the arguments for host reference implementation
+ Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
+ Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
+ Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
+ Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
+
+ cutlass::reference::host::GettBlockScalingMainloopParams<
+ ElementAccumulator, // ElementAccumulator
+ decltype(tensor_A), // TensorA
+ decltype(tensor_SFA), // TensorSfA
+ decltype(tensor_B), // TensorB
+ decltype(tensor_SFB) // TensorSfB
+ > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
+
+ auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
+ auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
+ auto tensor_SFD = make_tensor(block_reference_SFD.host_data(), layout_SFD);
+
+ cutlass::reference::host::GettBlockScalingEpilogueParams<
+ ElementAccumulator, // ElementScalar
+ ElementAccumulator, // ElementAccumulator
+ ElementAccumulator, // ElementCompute
+ decltype(tensor_C), // TensorC
+ decltype(tensor_D), // TensorD
+ decltype(tensor_SFD), // TensorSfD
+ cute::Int<OutputSFVectorSize>,
+ cutlass::reference::host::SfStrategy::SfDGen
+ > epilogue_params{options.alpha, options.beta, tensor_C, tensor_D, tensor_SFD, block_Normconst.at(cutlass::make_Coord(0))};
+
+ cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
+
+ // Comparison
+ block_D.sync_host();
+ bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
+ passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
+ passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
+
+ return passed;
+}
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ cudaDeviceSynchronize();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with the CUDA 12.8 Toolkit or newer to run this example,
+ // and the device must have compute capability 120 (Blackwell SM120).
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
+ std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+
+ if (!(props.major == 12 && props.minor == 0)) {
+ std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+ run<Gemm>(options);
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
new file mode 100644
index 00000000..ac2f39c9
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
@@ -0,0 +1,546 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM120 architecture.
+
+ This example demonstrates a simple way to instantiate and run a mixed precision blockscaled GEMM on the NVIDIA Blackwell SM120 architecture.
+ This kernel is optimized for the GeForce RTX 50 series GPUs.
+
+ The Blackwell SM120 CUTLASS kernel uses the new Block Scaled Tensor Core MMA Instructions (mma.sync.aligned.block_scale).
+ MXFP8 MMA has 2x throughput compared to Ada Tensor Core FP8 MMA.
+ (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+
+ Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
+ 1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
+ 2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
+ 3. Block Scaled Tensor Core MMA Instructions
+ 4. Epilogue Optimization
+
+ Note that GeForce RTX 50 series GPUs do not support:
+ 1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
+ 2. Dynamic datatypes.
+
+ Usage:
+
+ $ ./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+
+#include
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = cutlass::mx_float8_t; // Element type for A matrix operand
+using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 16; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = cutlass::mx_float6_t; // Element type for B matrix operand
+using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 128; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
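+// Note: 16 MXFP8 (8-bit) elements span 16 bytes, while 128 FP6 (6-bit) elements span 96 bytes; the
+// packed sub-byte B operand requires the larger element-count alignment here.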
+
+// C/D matrix configuration
+using ElementD = cutlass::bfloat16_t; // Element type for D matrix operand
+using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
+using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
+using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
+constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+// Kernel functional config
+using ElementAccumulator = float; // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
+
+// Kernel Perf config
+using ThreadBlockShape = Shape<_128,_128,_128>; // Threadblock's tile size
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster
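+// As noted above, SM120 GeForce GPUs do not support the multicast feature of TMA load, so the
+// cluster shape must remain 1x1x1.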
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ThreadBlockShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto,
+ ElementAccumulator, ElementAccumulator,
+ ElementC, LayoutCTag, AlignmentC,
+ ElementD, LayoutDTag, AlignmentD,
+ cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
+ >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutATag, AlignmentA,
+ ElementB, LayoutBTag, AlignmentB,
+ ElementAccumulator,
+ ThreadBlockShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto defaults to cooperative kernel schedule
+ >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int,int,int,int>, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue,
+ void>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+// Reference device GEMM implementation type
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
+using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
+using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
+using StrideD = typename Gemm::GemmKernel::StrideD;
+using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+LayoutA layout_A;
+LayoutSFA layout_SFA;
+StrideB stride_B;
+LayoutB layout_B;
+LayoutSFB layout_SFB;
+StrideC stride_C;
+LayoutC layout_C;
+StrideD stride_D;
+LayoutD layout_D;
+uint64_t seed;
+
+// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
+// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
+cutlass::HostTensor block_A;
+cutlass::HostTensor block_SFA;
+cutlass::HostTensor block_B;
+cutlass::HostTensor block_SFB;
+cutlass::HostTensor block_C;
+// Output Tensor
+cutlass::HostTensor block_D;
+// Reference Output Tensor
+cutlass::HostTensor block_reference_D;
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+template <typename T>
+auto make_iterator(T* ptr) {
+ using namespace cute;
+ if constexpr (cute::is_subbyte_v<T>) {
+ return subbyte_iterator<T>(ptr);
+ }
+ else {
+ return ptr;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(1024), n(1024), k(1024),
+ alpha(1.f), beta(0.f),
+ iterations(10)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "79c_blackwell_geforce_mixed_mxfp8_bf16_gemm\n\n"
+ << " Blackwell NVFP4 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Number of profiling iterations to perform.\n\n";
+
+ out << "\n\nExamples:\n\n"
+ << "$ " << "./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_bf16_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template <typename Element, typename Layout>
+bool initialize_block(
+ cutlass::TensorView<Element, Layout> view,
+ uint64_t seed) {
+
+ double scope_max, scope_min;
+ constexpr int bits_input = cutlass::sizeof_bits<Element>::value;
+
+ if constexpr (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ }
+ else if constexpr (bits_input <= 6) {
+ scope_max = 2;
+ scope_min = -2;
+ }
+ else if constexpr (bits_input <= 8) {
+ if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
+ scope_max = 4;
+ scope_min = 1;
+ }
+ else {
+ scope_max = 1;
+ scope_min = -1;
+ }
+ }
+ else{
+ scope_max = 4;
+ scope_min = -4;
+ }
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+ using namespace cute;
+ // For SFA and SFB tensors layouts
+ using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
+
+ layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
+ layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
+ layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
+ layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
+ layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
+ layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
+
+ block_A.reset(cutlass::make_Coord(size(layout_A)));
+ block_B.reset(cutlass::make_Coord(size(layout_B)));
+ block_C.reset(cutlass::make_Coord(size(layout_C)));
+ block_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
+ block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
+ block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
+
+ initialize_block(block_A.host_view(), seed + 2021);
+ initialize_block(block_B.host_view(), seed + 2022);
+ initialize_block(block_C.host_view(), seed + 2023);
+ initialize_block(block_SFA.host_view(), seed + 2024);
+ initialize_block(block_SFB.host_view(), seed + 2025);
+
+ block_A.sync_device();
+ block_B.sync_device();
+ block_C.sync_device();
+ block_SFA.sync_device();
+ block_SFB.sync_device();
+}
+
+// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments {
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k, 1},
+ { // Mainloop arguments
+ block_A.device_data(), stride_A,
+ block_B.device_data(), stride_B,
+ block_SFA.device_data(), layout_SFA,
+ block_SFB.device_data(), layout_SFB
+ },
+ { // Epilogue arguments
+ {options.alpha, options.beta},
+ block_C.device_data(), stride_C,
+ block_D.device_data(), stride_D
+ }
+ };
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ using namespace cute;
+ // Create the arguments for host reference implementation
+ Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
+ Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
+ Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
+ Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
+
+ cutlass::reference::host::GettBlockScalingMainloopParams<
+ ElementAccumulator, // ElementAccumulator
+ decltype(tensor_A), // TensorA
+ decltype(tensor_SFA), // TensorSfA
+ decltype(tensor_B), // TensorB
+ decltype(tensor_SFB) // TensorSfB
+ > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
+
+ auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
+ auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
+
+ cutlass::reference::host::GettBlockScalingEpilogueParams<
+ ElementAccumulator, // ElementScalar
+ ElementAccumulator, // ElementAccumulator
+ ElementAccumulator, // ElementCompute
+ decltype(tensor_C), // TensorC
+ decltype(tensor_D) // TensorD
+ > epilogue_params{options.alpha, options.beta, tensor_C, tensor_D};
+
+ cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
+
+ // Comparison
+ block_D.sync_host();
+ bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
+ passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
+ passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
+
+ return passed;
+}
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ cudaDeviceSynchronize();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with the CUDA 12.8 Toolkit or newer to run this example,
+ // and the device must have compute capability 120 (Blackwell SM120).
+ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
+ std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+
+ if (!(props.major == 12 && props.minor == 0)) {
+ std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+ run<Gemm>(options);
+#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/79_blackwell_geforce_gemm/CMakeLists.txt b/examples/79_blackwell_geforce_gemm/CMakeLists.txt
new file mode 100644
index 00000000..cb7e3e97
--- /dev/null
+++ b/examples/79_blackwell_geforce_gemm/CMakeLists.txt
@@ -0,0 +1,47 @@
+
+# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+if (CUTLASS_NVCC_ARCHS MATCHES 120a)
+cutlass_example_add_executable(
+ 79a_blackwell_geforce_nvfp4_bf16_gemm
+ 79a_blackwell_geforce_nvfp4_bf16_gemm.cu
+)
+
+cutlass_example_add_executable(
+ 79b_blackwell_geforce_nvfp4_nvfp4_gemm
+ 79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
+)
+
+cutlass_example_add_executable(
+ 79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm
+ 79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
+)
+
+endif()
diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
index 417830f2..3148d2aa 100644
--- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
+++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
@@ -216,7 +216,7 @@ struct Options {
out
<< "\n\nExamples:\n\n"
- << "$ " << "81_blackwell_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+ << "$ " << "112_blackwell_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a1a5c00a..0f03cd9b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -157,6 +157,7 @@ foreach(EXAMPLE
76_blackwell_conv
77_blackwell_fmha
78_blackwell_emulated_bf16x9_gemm
+ 79_blackwell_geforce_gemm
81_blackwell_gemm_blockwise
)
diff --git a/examples/README.md b/examples/README.md
index 68bf7077..92779c07 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -282,6 +282,10 @@
Blackwell SM100 FastFP32 (using BF16 to emulate SGEMM) kernel
+* [79_blackwell_geforce_gemm](79_blackwell_geforce_gemm/)
+
+ Blackwell SM120 MMA kernels targeting GeForce RTX 50 series GPUs
+
# CuTe - Programming Examples
Examples that do not rely on CUTLASS and directly showcase the features of CuTe are located in [cutlass/examples/cute](./cute/).
@@ -291,3 +295,35 @@ Additionally, CuTe's core layout and layout algebra have their own test cases wi
# Python Interface Examples
Examples leveraging CUTLASS's [Python interface](../python/README.md) are located in [cutlass/examples/python](python/).
+
+# Copyright
+
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/examples/common/gather_tensor.hpp b/examples/common/gather_tensor.hpp
index 67ae811b..46fb6400 100644
--- a/examples/common/gather_tensor.hpp
+++ b/examples/common/gather_tensor.hpp
@@ -58,7 +58,7 @@ struct IndexedGather
operator()(I i) const { return indices_[i]; }
CUTE_HOST_DEVICE friend
- void
+ void
print(IndexedGather const &s) {
cute::print("Indexed");
}
@@ -80,7 +80,7 @@ struct StridedGather
operator()(I i) const { return i * stride_; }
CUTE_HOST_DEVICE friend
- void
+ void
print(StridedGather const &s) {
cute::print("Strided{");
print(s.stride_);
@@ -153,7 +153,7 @@ make_custom_stride_layout(Stride const &stride, Func&& func)
/// Helper function to optionally create a gather tensor
template
CUTLASS_HOST_DEVICE
-auto
+auto
make_gather_tensor(Iterator iter, Shape const &shape, Stride const &stride, Func &&func)
{
if constexpr (not cutlass::platform::is_same, NoGather>::value) {
@@ -180,7 +180,7 @@ upcast(Shape const& shape, Stride const& stride)
return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast<N>(s,d); });
} else if constexpr (is_scaled_basis<Stride>::value) {
if constexpr (Stride::mode() == I) {
- return make_layout(shape_div(shape, Int<N>{}), shape_div(stride, Int<N>{}));
+ return make_layout(ceil_div(shape, Int<N>{}), ceil_div(stride, Int<N>{}));
} else {
return make_layout(shape, stride);
}
diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt
index f263e5ce..3c9e93c4 100644
--- a/examples/cute/tutorial/CMakeLists.txt
+++ b/examples/cute/tutorial/CMakeLists.txt
@@ -27,34 +27,31 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+add_subdirectory(hopper)
+add_subdirectory(blackwell)
cutlass_example_add_executable(
- sgemm_1
+ cute_tutorial_sgemm_1
sgemm_1.cu
)
cutlass_example_add_executable(
- sgemm_2
+ cute_tutorial_sgemm_2
sgemm_2.cu
)
cutlass_example_add_executable(
- sgemm_sm70
+ cute_tutorial_sgemm_sm70
sgemm_sm70.cu
)
cutlass_example_add_executable(
- sgemm_sm80
+ cute_tutorial_sgemm_sm80
sgemm_sm80.cu
)
cutlass_example_add_executable(
- tiled_copy
+ cute_tutorial_tiled_copy
tiled_copy.cu
)
-cutlass_example_add_executable(
- wgmma_sm90
- wgmma_sm90.cu
-)
-
diff --git a/examples/cute/tutorial/blackwell/01_mma_sm100.cu b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
new file mode 100644
index 00000000..3f73140a
--- /dev/null
+++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
@@ -0,0 +1,592 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// CuTe Tutorial for SM100 Programming
+// This tutorial series demonstrates CuTe Blackwell capabilities that are frequently used
+// throughout CUTLASS. The goal is to familiarize developers with CuTe SM100 interfaces.
+//
+// The tutorial series is split into five stages:
+// * 01_mma_sm100.cu: Simple Blackwell SM100 GEMM using a tcgen05.mma instruction.
+// * 02_mma_tma_sm100.cu: Simple Blackwell SM100 GEMM using tcgen05.mma and TMA instructions.
+// * 03_mma_tma_multicast_sm100.cu: Blackwell SM100 GEMM using tcgen05.mma and Multicast TMA.
+// * 04_mma_tma_2sm_sm100.cu: Blackwell SM100 GEMM with 2SM tcgen05.mma and 2SM Multicast TMA.
+// * 05_mma_tma_epi_sm100.cu: Blackwell SM100 GEMM with 2SM tcgen05.mma, 2SM TMA mainloop, and TMA epilogue.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cstdio>
+#include <cstdlib>
+
+// Use Thrust to handle host/device allocations
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+// Cutlass includes
+#include <cutlass/half.h>                        // F16 data type
+#include <cutlass/cutlass.h>
+#include <cutlass/arch/mma.h>                    // SM100 MMA support macros
+#include <cutlass/arch/barrier.h>                // cutlass::arch::umma_arrive
+
+// CuTe includes
+#include <cute/tensor.hpp>                       // CuTe tensor implementation
+#include <cute/arch/cluster_sm90.hpp>            // CuTe functions for querying the details of the launched cluster
+#include <cute/numeric/integral_constant.hpp>    // Compile-time constants such as _1, _256, etc.
+#include <cute/algorithm/cooperative_copy.hpp>   // cute::cooperative_copy
+
+// Tutorial helpers
+#include "example_utils.hpp"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tutorial 01: Simple Blackwell SM100 GEMM using a tcgen05.mma instruction
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// The goal of this tutorial is to show the CuTe interface for tcgen05.mma and tcgen05.ld operations.
+// We will implement a GEMM operation: D (F32) = beta * C (F32) + alpha * A (F16) * B (F16) where:
+// - Matrix A is MxK, K-major (BLAS transpose T, row-major)
+// - Matrix B is NxK, K-major (BLAS transpose N, column-major)
+// - Matrices C and D are MxN, N-major (BLAS row-major)
+//
+// This GEMM kernel performs the following steps:
+// 1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) for one MmaTile
+// using auto-vectorizing copy operations.
+// 2. Perform matrix multiply-accumulate (MMA) operations using the tcgen05.mma instruction.
+// 3. Load the completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld.
+// 4. Read the C matrix from global memory (GMEM) to registers (RMEM).
+// 5. Apply alpha and beta scaling to the MMA accumulator and C matrix.
+// 6. Store D matrix from registers (RMEM) to global memory (GMEM).
+//
+// SM100 tcgen05.mma instructions operate as follows:
+// - Read matrix A from SMEM or TMEM
+// - Read matrix B from SMEM
+// - Write accumulator to TMEM
+// The accumulator in TMEM must then be loaded to registers before writing back to GMEM.
+//
+// The tcgen05.mma instruction requires an Instruction Descriptor that encodes the A, B, and accumulator
+// data types as well as the MMA's M and N dimensions.
+// The A and B matrices read from SMEM must be passed to the MMA instruction as SMEM Descriptors;
+// in CuTe terminology, these are the A and B fragments of the tcgen05.mma.
+// CuTe builds both kinds of descriptors transparently through the MMA atom and its fragments, as shown in this tutorial.
+//
+// The MMA details:
+// We use the tcgen05.mma.f16 instruction (F16xF16 = F32) that performs a 128x256x16 MMA
+// operation. F32 accumulator type is chosen since both C and D matrices use F32.
+// This example uses F16xF16 = F32 MMA where:
+// TypeA = cutlass::half_t; // MMA A Data Type
+// TypeB = cutlass::half_t; // MMA B Data Type
+// TypeC = float; // MMA C Data Type
+// TypeD = float; // MMA D Data Type
+// TypeAccumulator = float; // Both TypeC and TypeD are float, so we use float accumulator type
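+//
+// For illustration only (a sketch, not the tutorial's actual host-side setup, which appears later in
+// this file and may differ): a TiledMMA for this configuration could be built from a single SM100 F16
+// MMA atom, assuming the CuTe atom name SM100_MMA_F16BF16_SS and a 128x256x64 MMA tiler, e.g.
+//
+//   TiledMMA tiled_mma = make_tiled_mma(SM100_MMA_F16BF16_SS<TypeA, TypeB, float,
+//                                                            128, 256,
+//                                                            UMMA::Major::K, UMMA::Major::K>{});
+//   auto mma_tiler = make_shape(Int<128>{}, Int<256>{}, Int<64>{});  // (MmaTile_M, MmaTile_N, MmaTile_K)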
+
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+// The shared memory buffers for A and B matrices.
+template <class TypeA, class ASmemLayout,   // (MmaA, NumMma_M, NumMma_K, ...)
+          class TypeB, class BSmemLayout>   // (MmaB, NumMma_N, NumMma_K, ...)
+struct SharedStorage
+{
+ alignas(128) cute::ArrayEngine<TypeA, cute::cosize_v<ASmemLayout>> A;
+ alignas(128) cute::ArrayEngine<TypeB, cute::cosize_v<BSmemLayout>> B;
+
+ alignas(16) cute::uint64_t mma_barrier; // Barrier to track MMA computation on SMEM
+
+ CUTE_DEVICE constexpr auto tensor_sA() { return make_tensor(make_smem_ptr(A.begin()), ASmemLayout{}); }
+ CUTE_DEVICE constexpr auto tensor_sB() { return make_tensor(make_smem_ptr(B.begin()), BSmemLayout{}); }
+};
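+
+// Note (illustration only, not part of the original tutorial flow): ArrayEngine<T, N> is a static SMEM
+// array of N elements of T, with N = cosize_v<Layout>, i.e. the number of elements the layout addresses.
+// For a hypothetical un-swizzled 128x64 K-major A tile this would be
+//
+//   auto sA_layout = make_layout(make_shape(Int<128>{}, Int<64>{}), make_stride(Int<64>{}, Int<1>{}));
+//   static_assert(cute::cosize_v<decltype(sA_layout)> == 128 * 64);
+//
+// The layouts actually used below are 128B-swizzled UMMA SMEM atoms (printed as Sw<3,4,3> in tCsA/tCsB),
+// and alignas(128) keeps the buffers at the conservative alignment those operands expect.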
+
+// The device kernel
+template <class SharedStorage,
+          class ATensor, class BTensor, class CTensor, class DTensor,
+          class MmaTiler_MNK, class TiledMMA, class ClusterShape_MNK,
+          class Alpha, class Beta>
+__global__ static
+void
+gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
+ BTensor mB, // (Gemm_N, Gemm_K)
+ CTensor mC, // (Gemm_M, Gemm_N)
+ DTensor mD, // (Gemm_M, Gemm_N)
+ MmaTiler_MNK mma_tiler, // (MmaTile_M, MmaTile_N, MmaTile_K)
+ TiledMMA tiled_mma, // (Mma_M, Mma_N, Mma_K)
+ ClusterShape_MNK cluster_shape, // (ClusterM, ClusterN, ClusterK)
+ Alpha alpha, Beta beta)
+{
+ // Step 1: The Prologue.
+
+ // The CTA layout within the Cluster: (V,M,N,K) -> CTA idx
+ Layout cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape),
+ make_tile(typename TiledMMA::AtomThrID{}));
+
+ // Construct the MMA grid coordinate from the CTA grid coordinate
+ auto mma_coord_vmnk = make_coord(blockIdx.x % size<0>(cluster_layout_vmnk), // Peer CTA coordinate
+ blockIdx.x / size<0>(cluster_layout_vmnk), // MMA-M coordinate
+ blockIdx.y, // MMA-N coordinate
+ _); // MMA-K coordinate
+
+ // Partition the GMEM tensors with the mma_tiler and mma_coord to get the slices processed
+ // by this mma tile.
+ // CuTe provides local_tile partitioning function. local_tile accepts 4 parameters:
+ // * Tensor to partition
+ // * Tiler to use for partitioning
+ // * Coordinate to use for slicing the partitioned tensor
+ // * Projection to ignore unwanted modes of the Tiler and Coordinate
+ auto mma_coord = select<1,2,3>(mma_coord_vmnk);
+ Tensor gA = local_tile(mA, mma_tiler, mma_coord, Step<_1, X,_1>{}); // (MmaTile_M, MmaTile_K, Tiles_K)
+ Tensor gB = local_tile(mB, mma_tiler, mma_coord, Step< X,_1,_1>{}); // (MmaTile_N, MmaTile_K, Tiles_K)
+ Tensor gC = local_tile(mC, mma_tiler, mma_coord, Step<_1,_1, X>{}); // (MmaTile_M, MmaTile_N)
+ Tensor gD = local_tile(mD, mma_tiler, mma_coord, Step<_1,_1, X>{}); // (MmaTile_M, MmaTile_N)
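+
+ // Worked example of the projections above (the numbers match the prints below): with mA = (512,256),
+ // mB = (1024,256) and mma_tiler = (128,256,64), Step<_1, X,_1> keeps only the (M,K) modes of the tiler
+ // for A, so gA has shape (128,64,4), where 4 = Gemm_K / MmaTile_K is the number of K tiles; likewise
+ // Step< X,_1,_1> keeps the (N,K) modes for B, and Step<_1,_1, X> keeps the (M,N) modes for C and D.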
+
+ if (thread0()) {
+ print("mA:\t"); print(mA); print("\n"); // mA: gmem_ptr[16b](GMEM_ADDR_A) o (512,256):(256,_1)
+ print("mB:\t"); print(mB); print("\n"); // mB: gmem_ptr[16b](GMEM_ADDR_B) o (1024,256):(256,_1)
+ print("mC:\t"); print(mC); print("\n"); // mC: gmem_ptr[32b](GMEM_ADDR_C) o (512,1024):(1024,_1)
+ print("mD:\t"); print(mD); print("\n"); // mD: gmem_ptr[32b](GMEM_ADDR_D) o (512,1024):(1024,_1)
+
+ print("gA:\t"); print(gA); print("\n"); // gA: gmem_ptr[16b](GMEM_ADDR_A + offset_for_mma_tile) o (_128,_64,4):(256,_1,_64)
+ print("gB:\t"); print(gB); print("\n"); // gB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile) o (_256,_64,4):(_1,256,16384)
+ print("gC:\t"); print(gC); print("\n"); // gC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile) o (_128,_256):(256,_1)
+ print("gD:\t"); print(gD); print("\n"); // gD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile) o (_128,_256):(256,_1)
+ } __syncthreads();
+
+ // The SMEM tensors
+
+ // Allocate SMEM
+ extern __shared__ char shared_memory[];
+ SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);
+
+ // Represent the SMEM buffers for A and B
+ Tensor tCsA = shared_storage.tensor_sA(); // (MmaA, NumMma_M, NumMma_K)
+ Tensor tCsB = shared_storage.tensor_sB(); // (MmaB, NumMma_N, NumMma_K)
+
+ //
+ // Mma partitioning for A and B
+ //
+ // Note: Partitioned tensors use tXgY naming convention:
+ // tXgY -> The partitioning pattern tX applied to tensor gY
+
+ auto mma_v = get<0>(mma_coord_vmnk);
+ ThrMMA cta_mma = tiled_mma.get_slice(mma_v); // Use Peer CTA coordinate
+ Tensor tCgA = cta_mma.partition_A(gA); // (MmaA, NumMma_M, NumMma_K, Tiles_K)
+ Tensor tCgB = cta_mma.partition_B(gB); // (MmaB, NumMma_N, NumMma_K, Tiles_K)
+ Tensor tCgC = cta_mma.partition_C(gC); // (MmaC, NumMma_M, NumMma_N)
+ Tensor tCgD = cta_mma.partition_C(gD); // (MmaC, NumMma_M, NumMma_N)
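+
+ // Illustrative note: partition_A/B/C reshape a tile into (MmaOp, NumMma_M/N, NumMma_K, ...) so that the
+ // first mode is exactly the data consumed by one MMA instruction. For this 128x256x16 MMA and a
+ // 128x256x64 MMA tile, tCgA is ((_128,_16),_1,_4,4): one 128x16 A block per instruction, 1 MMA in M,
+ // 4 MMAs in K per tile, and 4 K tiles (see the prints below).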
+
+ if (thread0()) {
+ print("tCgA:\t"); print(tCgA); print("\n"); // tCgA: gmem_ptr[16b](GMEM_ADDR_A + offset_for_mma_tile + offset_for_mma) o ((_128,_16),_1,_4,4):((256,_1),_0,_16,_64)
+ print("tCgB:\t"); print(tCgB); print("\n"); // tCgB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile + offset_for_mma) o ((_256,_16),_1,_4,4):((_1,256),_0,4096,16384)
+ print("tCgC:\t"); print(tCgC); print("\n"); // tCgC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((256,_1),_0,_0)
+ print("tCgD:\t"); print(tCgD); print("\n"); // tCgD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((256,_1),_0,_0)
+ } __syncthreads();
+
+ // MMA Fragment Allocation
+ // We allocate "fragments" which are SMEM descriptors that serve as inputs to cute::gemm operations.
+ // For tcgen05.mma operations:
+ // - Matrices A and B are sourced from SMEM
+ // - tCrA and tCrB provide descriptor views of tCsA and tCsB respectively
+ // - The first mode of each descriptor represents the SMEM for a single MMA operation
+ Tensor tCrA = cta_mma.make_fragment_A(tCsA); // (MmaA, NumMma_M, NumMma_K)
+ Tensor tCrB = cta_mma.make_fragment_B(tCsB); // (MmaB, NumMma_N, NumMma_K)
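+
+ // Illustrative note: each element of tCrA/tCrB is a 64-bit UMMA SMEM descriptor (see the
+ // UMMA::DescriptorIterator prints below) rather than data held in registers; the MMA instruction
+ // dereferences the descriptor to read A and B directly from SMEM, so no SMEM->RMEM copies of A and B
+ // are needed on SM100.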
+
+ // TMEM Allocation
+ // On SM100 architecture, accumulators are stored exclusively in tensor memory (TMEM).
+ // ThrMma's make_fragment_C() creates a TMEM tensor with the appropriate layout for the accumulator.
+ Tensor tCtAcc = cta_mma.make_fragment_C(tCgC); // (MmaC, NumMma_M, NumMma_N)
+
+ if (thread0()) {
+ print("tCsA:\t"); print(tCsA); print("\n"); // tCsA: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_A) o ((_128,_16),_1,_4):((_64,_1),_0,_16)
+ print("tCsB:\t"); print(tCsB); print("\n"); // tCsB: Sw<3,4,3>_smem_ptr[16b](SMEM_ADDR_B) o ((_256,_16),_1,_4):((_64,_1),_0,_16)
+ print("tCrA:\t"); print(tCrA); print("\n"); // tCrA: UMMA::DescriptorIterator o (_1,_1,_4):(_0,_0,_2)
+ print("tCrB:\t"); print(tCrB); print("\n"); // tCrB: UMMA::DescriptorIterator o (_1,_1,_4):(_0,_0,_2)
+ print("tCtAcc:\t"); print(tCtAcc); print("\n"); // tCtAcc: tmem_[32b](TMEM_ADDR) o ((_128,_256),_1,_1):((_65536,_1),_0,_0)
+ } __syncthreads();
+
+ // Barrier Initialization
+ uint32_t elect_one_thr = cute::elect_one_sync();
+ uint32_t elect_one_warp = (threadIdx.x / 32 == 0);
+
+ // Barriers in SMEM initialized by a single thread.
+ if (elect_one_warp && elect_one_thr) {
+ cute::initialize_barrier(shared_storage.mma_barrier, /* num_ctas */ 1);
+ }
+ int mma_barrier_phase_bit = 0; // Each barrier has an associated phase_bit.
+ __syncthreads(); // Make sure all threads observe barrier initialization.
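+
+ // Illustrative note on the barrier protocol used in the mainloop below: an SMEM mbarrier flips its
+ // phase each time the expected number of arrivals (here 1, the MMA) completes. wait_barrier(bar, phase)
+ // blocks until the barrier's phase differs from `phase`, so each iteration waits and then flips the
+ // local phase bit:
+ //
+ //   cute::wait_barrier(shared_storage.mma_barrier, mma_barrier_phase_bit);
+ //   mma_barrier_phase_bit ^= 1;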
+
+ // Step 2: The Mainloop.
+
+ // Set the MMA accumulate option to Zero so that the first MMA instruction clears the TMEM accumulator.
+ tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
+
+ // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
+ for (int k_tile = 0; k_tile < size<3>(tCgA); ++k_tile)
+ {
+ // Step 2a: Load A and B tiles
+
+ // Using auto-vectorized copy operation:
+ // - Utilizes 128 threads for parallel data transfer
+ // - Copy operations are distributed efficiently across all threads
+ // - CuTe can automatically determine optimal vector width
+ cooperative_copy<128>(threadIdx.x, tCgA(_,_,_,k_tile), tCsA); // Load MmaTile_M x MmaTile_K A tile
+ cooperative_copy<128>(threadIdx.x, tCgB(_,_,_,k_tile), tCsB); // Load MmaTile_N x MmaTile_K B tile
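+
+ // Illustrative note: cooperative_copy<N>(tid, src, dst) partitions the copy across N cooperating
+ // threads and auto-vectorizes up to the widest load/store the layouts' alignment and contiguity allow
+ // (up to 128-bit vectors). An equivalent hand-written form would build a TiledCopy and call copy(),
+ // e.g., with a hypothetical TiledCopy tiled_copy_a built with make_tiled_copy for 128 threads:
+ //
+ //   ThrCopy thr_copy_a = tiled_copy_a.get_slice(threadIdx.x);
+ //   copy(thr_copy_a.partition_S(tCgA(_,_,_,k_tile)), thr_copy_a.partition_D(tCsA));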
+
+ // Step 2b: Execute the MMAs for this tile
+
+ // Wait for loads to SMEM to complete with __syncthreads()
+ __syncthreads();
+
+ // tcgen05.mma instructions require single-thread execution:
+ // - Only one warp performs the MMA-related loop operations
+ // - CuTe operations internally manage the single-thread execution of tcgen05.mma and tcgen05.cp
+ // - No explicit elect_one_sync region is needed from the user
+ if (elect_one_warp) {
+ // Execute a MmaTile_M x MmaTile_N x MmaTile_K GEMM
+ for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+ gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCtAcc);
+ tiled_mma.accumulate_ = UMMA::ScaleOut::One;
+ }
+ // Make the MMA arrive on the barrier once all issued MMAs have finished reading A and B from SMEM.
+ cutlass::arch::umma_arrive(&shared_storage.mma_barrier);
+ }
+ // Wait for the MMAs to complete before the next k_tile iteration overwrites the A and B SMEM.
+ cute::wait_barrier(shared_storage.mma_barrier, mma_barrier_phase_bit);
+ mma_barrier_phase_bit ^= 1;
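+
+ // Illustrative note: tcgen05.mma executes asynchronously, so __syncthreads() alone cannot guarantee
+ // that the MMAs have finished reading SMEM. umma_arrive above asks the MMA pipeline to arrive on the
+ // mbarrier once all previously issued MMAs are done with their SMEM operands, and the wait above
+ // releases the A and B buffers for the next k_tile.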
+ }
+
+ // Step 3: The Epilogue.
+
+ // Create the tiled copy operation for the accumulator (TMEM -> RMEM)
+ TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc);
+ ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x);
+
+ Tensor tDgC = thr_t2r_copy.partition_D(tCgC); // (CpyD, NumCpy_M, NumCpy_N)
+ Tensor tDrC = make_fragment_like(tDgC); // (CpyD, NumCpy_M, NumCpy_N)
+ // Load C tensor GMEM -> RMEM
+ copy(tDgC, tDrC);
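+
+ // Illustrative note: SM100_TMEM_LOAD_32dp32b1x names a tcgen05.ld shape -- 32 TMEM data paths ("dp")
+ // of 32-bit elements, repeated 1x per instruction. make_tmem_copy tiles that atom over the 128x256
+ // accumulator so the CTA's threads collectively read all of the accumulator from TMEM into registers.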
+
+ Tensor tDtAcc = thr_t2r_copy.partition_S(tCtAcc); // (CpyS, NumCpy_M, NumCpy_N)
+ Tensor tDgD = thr_t2r_copy.partition_D(tCgD); // (CpyD, NumCpy_M, NumCpy_N)
+ using AccType = typename decltype(tCtAcc)::value_type;
+ Tensor tDrAcc = make_tensor