v4.2 release. (#2587)

* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set on the command line.

* v4.2 release.
This commit is contained in:
Junkai-Wu
2025-08-23 06:11:24 +08:00
committed by GitHub
parent 11cad1f67b
commit a49a78ffef
351 changed files with 28182 additions and 2032 deletions

View File

@ -659,7 +659,7 @@ struct Testbed {
}
int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
int64_t bytes = cutlass::bits_to_bytes(
int64_t bytes = cutlass::bits_to_bytes<int64_t>(
(cutlass::sizeof_bits<ElementD>::value * 2 + cutlass::sizeof_bits<ElementSoftmax>::value) *
options.problem_size.m() * options.problem_size.n());

View File

@ -33,8 +33,8 @@
computing reference permutations of 4/5D tensors when source data is column-major.
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/coord.h"

View File

@ -40,14 +40,12 @@
Note that in general the fragment passed to the OutputOp could
span multiple rows but it does not happen with the configurations we have
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/aligned_buffer.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/functional.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/layout/vector.h"

View File

@ -42,12 +42,10 @@
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/aligned_buffer.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/functional.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/layout/vector.h"

View File

@ -38,10 +38,8 @@
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/cutlass.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/numeric_types.h"
#include "cutlass/array.h"
#include "cutlass/layout/vector.h"

View File

@ -37,12 +37,10 @@
*/
#pragma once
#include <cuda/std/cassert>
#include "cutlass/array.h"
#include CUDA_STD_HEADER(cassert)
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/array.h"
#include "cutlass/layout/vector.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/tensor_coord.h"

View File

@ -132,7 +132,7 @@ constexpr int ScaleGranularityK = 128;
constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, cute::GMMA::Major::MN, cute::GMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand

View File

@ -142,7 +142,7 @@ static constexpr int ScaleGranularityK = 128;
static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, cute::GMMA::Major::MN, cute::GMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand

View File

@ -454,11 +454,12 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -640,11 +640,11 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}
}
//
// Parse options

View File

@ -33,7 +33,7 @@ set(TEST_SWIZZLE_2 --swizzle=2)
set(TEST_SWIZZLE_5 --swizzle=5)
set(TEST_SWIZZLE_5_UNEVEN --swizzle=5 --m=4096 --n=16384)
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
70_blackwell_fp16_gemm
70_blackwell_fp16_gemm.cu

View File

@ -449,9 +449,9 @@ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MIN
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}

View File

@ -27,7 +27,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Both filenames are shorter to avoid MAX_PATH issues on Windows.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
71_blackwell_gemm_with_collective_builder
71_blackwell_gemm_with_collective_builder.cu

View File

@ -116,7 +116,7 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O
// Kernel Perf config
using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size
using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster
using ClusterShape = Shape<_2,_4,_1>; // Shape of the threadblocks in a cluster
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
@ -511,10 +511,10 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}
}
//
// Parse options

View File

@ -566,8 +566,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -117,7 +117,7 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O
// Kernel Perf config
using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size
using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster
using ClusterShape = Shape<_2,_4,_1>; // Shape of the threadblocks in a cluster
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
@ -512,8 +512,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -28,7 +28,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
72a_blackwell_nvfp4_bf16_gemm
72a_blackwell_nvfp4_bf16_gemm.cu

View File

@ -28,7 +28,7 @@
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
73_blackwell_gemm_preferred_cluster
blackwell_gemm_preferred_cluster.cu

View File

@ -513,7 +513,7 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (props.major != 10 || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
return 0;
}

View File

@ -29,9 +29,9 @@
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
cutlass_example_add_executable(
74_blackwell_gemm_streamk
blackwell_gemm_streamk.cu
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
cutlass_example_add_executable(
74_blackwell_gemm_streamk
blackwell_gemm_streamk.cu
)
endif()

View File

@ -556,10 +556,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -762,9 +762,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (!(props.major == 10 && props.minor == 0)) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 100a).\n";
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -138,8 +138,7 @@ using FusionOperation = cutlass::epilogue::fusion::LinCombEltActBlockScaleFactor
// Core kernel configurations
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using EpilogueOperatorClass = cutlass::arch::OpClassTensorOp; // Epilogue Operator class tag
using MainloopOperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Mainloop Operator class tag
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size
// Runtime Cluster Shape
@ -159,7 +158,7 @@ struct MMA2SMConfig {
};
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, EpilogueOperatorClass,
ArchTag, OperatorClass,
typename MMA1SMConfig::MmaTileShape, ClusterShape,
Shape<_128,_64>,
ElementAccumulator, ElementAccumulator,
@ -169,7 +168,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui
// , FusionOperation // Enable for SF Output
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, MainloopOperatorClass,
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
@ -187,7 +186,7 @@ using Gemm1SM = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Gemm = Gemm1SM;
using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, EpilogueOperatorClass,
ArchTag, OperatorClass,
typename MMA2SMConfig::MmaTileShape, ClusterShape,
Shape<_128,_64>,
ElementAccumulator, ElementAccumulator,
@ -197,13 +196,13 @@ using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::Collective
// , FusionOperation // Enable for SF Output
>::CollectiveOp;
using CollectiveMainloop2SM = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, MainloopOperatorClass,
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
typename MMA2SMConfig::MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
static_cast<int>(sizeof(typename CollectiveEpilogue2SM::SharedStorage))>,
typename MMA2SMConfig::KernelSchedule
>::CollectiveOp;
using GemmKernel2SM = cutlass::gemm::kernel::GemmUniversal<
@ -233,7 +232,7 @@ using LayoutSFD = typename Sm1xxBlockScaledOutputConfig::LayoutSF;
std::vector<StrideA> stride_A_host;
std::vector<StrideB> stride_B_host;
std::vector<LayoutSFA> layout_SFA_host;
std::vector<LayoutSFA> layout_SFB_host;
std::vector<LayoutSFB> layout_SFB_host;
std::vector<StrideC> stride_C_host;
std::vector<StrideD> stride_D_host;
@ -897,9 +896,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (!(props.major == 10 && props.minor == 0)) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 100a).\n";
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
return 0;
}

View File

@ -49,7 +49,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0)
set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes
set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
if(CUTLASS_NVCC_ARCHS STREQUAL "100a")
cutlass_example_add_executable(
75_blackwell_grouped_gemm
75_blackwell_grouped_gemm.cu

View File

@ -504,10 +504,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -504,10 +504,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -500,10 +500,19 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
if (__CUDACC_VER_MAJOR__ < 13) {
if (props.major != 10 || (props.minor != 0 && props.minor != 1)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
return 0;
}
}
else {
if ((props.major != 10 && props.major != 11) || props.minor != 0) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
return 0;
}
}
//
// Parse options
//

View File

@ -163,7 +163,7 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
77_blackwell_mla.cu
TEST_COMMAND_OPTIONS
TEST_MLA_BASIC
TEST_MLA_SEP_REDUCTION
TEST_MLA_SEP_REDUCTION
TEST_MLA_FUSE_REDUCTION
)
target_include_directories(77_blackwell_mla_2sm_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
@ -175,8 +175,8 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
77_blackwell_mla.cu
TEST_COMMAND_OPTIONS
TEST_MLA_BASIC
TEST_MLA_SEP_REDUCTION
TEST_MLA_FUSE_REDUCTION
TEST_MLA_SEP_REDUCTION
TEST_MLA_FUSE_REDUCTION
)
target_include_directories(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_compile_definitions(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${PREC_MACRO} CPASYNC)

View File

@ -100,7 +100,7 @@ public:
cutlass::fmha::kernel::FmhaKernelBwdConvert<ProblemShape, Element, ElementAccumulator>
>;
using OperationMha= cutlass::fmha::device::FMHA<
using OperationNormal = cutlass::fmha::device::FMHA<
cutlass::fmha::kernel::Sm100FmhaBwdKernelTmaWarpSpecialized<
ProblemShape, Element, ElementAccumulator, TileShape, Mask
>
@ -112,7 +112,7 @@ public:
>
>;
using Operation = std::conditional_t<IsMla, OperationMla, OperationMha>;
using Operation = std::conditional_t<IsMla, OperationMla, OperationNormal>;
using Kernel = typename Operation::Kernel;

View File

@ -365,7 +365,7 @@ struct Sm100FmhaGenKernelWarpspecialized {
pipeline_corr_epi_params.role = CollectiveMainloop::PipelineE::ThreadCategory::Consumer;
}
pipeline_corr_epi_params.producer_arv_count = NumWarpsCorrection * cutlass::NumThreadsPerWarp;
pipeline_corr_epi_params.consumer_arv_count = NumWarpsEpilogue * cutlass::NumThreadsPerWarp;
pipeline_corr_epi_params.consumer_arv_count = cute::max(1, NumWarpsEpilogue * cutlass::NumThreadsPerWarp);
typename CollectiveMainloop::PipelineE pipeline_corr_epi(
shared_storage.pipelines.corr_epi,
pipeline_corr_epi_params,

View File

@ -117,7 +117,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementC, LayoutC, AlignmentC,
cutlass::epilogue::NoSmemWarpSpecialized2Sm
cutlass::epilogue::FastF32NoSmemWarpSpecialized2Sm
>::CollectiveOp;
// Build the mainloop

View File

@ -88,7 +88,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -189,7 +189,7 @@ cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -283,7 +283,7 @@ struct Result
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -489,19 +489,28 @@ int run(Options &options)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 100.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
@ -509,8 +518,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
@ -530,9 +539,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}

View File

@ -86,7 +86,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -217,7 +217,7 @@ cutlass::HostTensor<ElementSFD, cutlass::layout::PackedVectorLayout> block_refer
// Matrix-wide normalization constant
cutlass::HostTensor<ElementCompute, cutlass::layout::PackedVectorLayout> block_Normconst;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -311,7 +311,7 @@ struct Result
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -536,19 +536,28 @@ int run(Options &options)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 100.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
@ -556,8 +565,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
@ -577,9 +586,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}

View File

@ -88,7 +88,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -189,7 +189,7 @@ cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -283,7 +283,7 @@ struct Result
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -489,19 +489,28 @@ int run(Options &options)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 100.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
@ -509,8 +518,8 @@ int main(int argc, char const **args) {
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
@ -530,9 +539,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}

View File

@ -97,7 +97,7 @@ using namespace cute;
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int,int,int>>; // <M,N,K> per group
using ElementInput = cutlass::float_e2m1_t; // Element type for Input matrix operands
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -263,7 +263,7 @@ cutlass::DeviceAllocation<ElementAccumulator> block_beta;
// NormConst is a single device-side constant value, its not per-batch or per-group
cutlass::DeviceAllocation<ElementAccumulator> norm_constant_device;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
@ -466,7 +466,7 @@ struct Result
bool passed = false;
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
@ -861,30 +861,39 @@ int run(Options &options, bool host_problem_shapes_available = true)
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 Toolkit to run this example
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 ||
((__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)
)
) {
std::cerr << "This example requires CUDA 12.8 or newer.\n";
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support.\n";
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support.\n";
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (!(props.major == 12 && props.minor == 0)) {
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120a).\n";
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120 or 121).\n";
return 0;
}
@ -901,7 +910,7 @@ int main(int argc, char const **args) {
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
allocate(options);
initialize(options);

View File

@ -46,7 +46,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0)
set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes
set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes
if (CUTLASS_NVCC_ARCHS MATCHES 120a)
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
cutlass_example_add_executable(
79a_blackwell_geforce_nvfp4_bf16_gemm
79a_blackwell_geforce_nvfp4_bf16_gemm.cu

View File

@ -78,7 +78,7 @@
#include "helper.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -248,7 +248,7 @@ struct Result
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -507,25 +507,34 @@ int run(Options &options)
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 120.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
//
@ -540,9 +549,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -78,7 +78,7 @@
#include "helper.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -183,7 +183,7 @@ cutlass::HostTensor<outputScaleFactor, cutlass::layout::PackedVectorLayout> bloc
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
cutlass::HostTensor<outputScaleFactor, cutlass::layout::PackedVectorLayout> block_reference_SFD;
cutlass::HostTensor<ElementCompute, cutlass::layout::PackedVectorLayout> block_Normconst;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
template <typename T>
auto make_iterator(T* ptr) {
return cute::recast_ptr<T>(ptr);
@ -259,7 +259,7 @@ struct Result
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -531,25 +531,34 @@ int run(Options &options)
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
// and must have compute capability at least 120.
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && props.minor == 0)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
return 0;
}
//
@ -564,9 +573,9 @@ int main(int argc, char const **args) {
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -27,7 +27,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES 120a)
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
cutlass_example_add_executable(
80a_blackwell_geforce_mxfp8_bf16_sparse_gemm
80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu

View File

@ -0,0 +1,104 @@
# Blockwise and Groupwise GEMM and Grouped GEMM on Blackwell
Blockwise and Groupwise GEMM and Grouped GEMM implement software scaling, with scale factors stored in the accumulator type.
The examples in this directory demonstrate how to instantiate and run these kernels.
The profiler enables instantiating and profiling different kernel configurations for Blockwise and Groupwise GEMM
to determine the best-performing kernel for your workload.
## Introduction
Blockwise and Groupwise GEMM operations enable fine-grained numerical precision control by applying scale factors at configurable granularities. This is particularly useful for quantized neural networks where different regions of tensors may have different scaling requirements.
For a GEMM $D = \alpha A B + \beta C$, we introduce two scale factor tensors, SFA
and SFB. This leads to a GEMM $D = \alpha (\text{SFA} \odot A)(\text{SFB} \odot B) + \beta C$, where $\odot$ denotes elementwise scaling under the broadcast described below.
## Scale Factor Tensors
- *SFA*: Broadcast the same scale within a block defined by _scale granularity m_ and _scale granularity k_ when scaling A.
- Scale granularity m and scale granularity k are also referred to as _scale vector m_ and _k_ respectively.
- *SFB*: Broadcast the same scale within a block defined by _scale granularity n_ and _scale granularity k_ when scaling B.
- Scale granularity n and scale granularity k are also referred to as _scale vector n_ and _k_ respectively.
These can be represented in CuTe as:
- *SFA Layout*: $((\text{scale granularity M}, M / \text{scale granularity M}), (\text{scale granularity K}, K / \text{scale granularity K})) : ((0, int), (0, int))$
- *SFB Layout*: $((\text{scale granularity N}, N / \text{scale granularity N}), (\text{scale granularity K}, K / \text{scale granularity K})) : ((0, int), (0, int))$
The 0 element strides ensure that every coordinate within a granularity block maps to the same scale factor element.
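To make the broadcast concrete, below is a minimal CuTe sketch of the SFA layout; the sizes (M = 256, K = 512, 128x128 granularity blocks) are hypothetical values chosen for illustration:
```
#include "cute/tensor.hpp"

using namespace cute;

// Hypothetical sizes: M = 256, K = 512, scale granularity m = k = 128.
// Shape:  ((128, 2), (128, 4))  -- (block extent, number of blocks) per mode.
// Stride: ((  0, 1), (  0, 2))  -- the 0 strides broadcast one scale factor
// element across each 128x128 block; the nonzero strides enumerate the 2x4
// grid of blocks compactly, M-major.
auto layout_SFA = make_layout(
    make_shape (make_shape (Int<128>{}, 2), make_shape (Int<128>{}, 4)),
    make_stride(make_stride(Int<0>{},   1), make_stride(Int<0>{},   2)));
```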
## Configuration
For convenience, the Blockwise and Groupwise implementations provide
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>`
to deduce layouts and manage compact tensors.
By default, `cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>` makes
every scale factor tensor major in the M/N mode, but this can be configured. For example,
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, UMMA::Major::K, UMMA::Major::MN>`
denotes that SFA will be major in the K dimension while SFB will be major in the N dimension.
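As a minimal sketch (granularity values are illustrative), the layout types can be deduced from the config as the examples in this directory do:
```
// Scale granularity m = 1, n = 128, k = 128 (illustrative values).
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<1, 128, 128>;

// Layout types for the SFA and SFB operands, deduced from the config.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());

// Variant with explicit majors: SFA major in K, SFB major in N.
using ScaleConfigKMajorA = cutlass::detail::Sm100BlockwiseScaleConfig<
    1, 128, 128, UMMA::Major::K, UMMA::Major::MN>;
```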
## Integration with Other Frameworks
If translating from frameworks like Torch, where SFA has shape
(M / ScaleGranularityM, K / ScaleGranularityK) and SFB has shape (K / ScaleGranularityK, N / ScaleGranularityN),
be sure to transpose SFB and B to fit the canonical CuTe layout form, so that K is always the second mode.
The strides can then be used to determine whether each tensor is MN or K major, and the layouts can be formed
either directly or with the convenience wrappers, as sketched below.
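A hypothetical helper (not a CUTLASS API) sketching that stride check for a scale factor tensor viewed as (M-blocks, K-blocks):
```
#include <cstdint>

// Hypothetical helper: classify a scale factor tensor from the strides of its
// (M-blocks, K-blocks) view, e.g. as reported by a framework like Torch.
// A unit stride on the K-blocks mode means the tensor is K-major; a unit
// stride on the M-blocks mode means it is MN-major.
inline bool sf_is_k_major(std::int64_t stride_m_blocks, std::int64_t stride_k_blocks) {
  return stride_k_blocks == 1 && stride_m_blocks != 1;
}
```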
## Kernel Selection and Profiling
To determine the most performant Blockwise/Groupwise GEMM or Grouped GEMM kernel for your use case, you can use the
[CUTLASS profiler](../../media/docs/cpp/profiler.md).
All Blockwise/Groupwise GEMMs and Grouped GEMMs with `f32` scaling of `e4m3` or runtime `f8` types can be built by
selecting a subset of kernels when configuring with CMake, passing:
`-DCUTLASS_LIBRARY_KERNELS="cutlass3x*f32xe4m3_*f32xe4m3*,cutlass3x*f32xf8_*f32xf8*"`.
The simplest way to use the profiler is to pass `m`, `n`, and `k` as well as your `scale_vec_size_m`,
`scale_vec_size_n`, and `scale_vec_size_k`. Passing `enable-best-kernel-for-fixed-shape` performs some autotuning
per kernel to determine the best rasterization order, swizzle, and cluster size. Passing `blockwiseGemm`
or `GroupedGemm` through the operation flag determines which set of operations will be profiled.
For example, this profiler command dumps the performance of all compiled kernels that support scale
granularity m = 1, scale granularity n = 128, and scale granularity k = 128 for the problem size 8192x8192x8192:
```
cutlass_profiler --operation=blockwiseGemm \
--enable-best-kernel-for-fixed-shape \
--m=8192 --n=8192 --k=8192 \
--scale_vec_size_m=1 --scale_vec_size_n=128 --scale_vec_size_k=128 \
--verification-enabled=false
```
### Kernel Naming Convention
The naming of the blockwise and groupwise kernels includes the following new pattern: for each scaled tensor and its scale factors we have
`<scale_granularity_m or scale_granularity_n>x<scale_granularity_k><accumulator type>x<scaled tensor type>`. For example,
`cutlass3x_sm100_tensorop_gemm_64x128f32xe4m3_1x128f32xe4m3_f32_f16_f16_64x128x128_1x1x1_0_nnn_align16_1sm` would denote:
- A CUTLASS 3 GEMM for SM100 that uses tensor cores.
- SFA is f32 with a 64 element scale granularity m and a 128 element scale granularity k.
- The A matrix is e4m3.
- SFB is f32 with a 1 element scale granularity n and a 128 element scale granularity k.
- The B matrix is e4m3.
- The epilogue is done in f32.
- The C matrix is f16.
- The D matrix is f16.
- The MMA tile shape is 64x128x128.
- The cluster shape is 1x1x1.
- A, B, C, and D are all column major.
- The alignment of the major modes are 16 elements for A, B, C, and D.
- The MMA variant is a 1SM instruction.
It is also worthwhile to note that C can be void if scaling by beta is not needed.
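As a sketch of a void-C epilogue (following the collective builder interface used by the examples in this repository; `ArchTag`, `OperatorClass`, tile and element aliases are assumed defined as in those examples):
```
// Passing void for ElementC tells the builder that no C operand is read
// (scaling by beta is unused), so no shared memory is allocated for C.
using CollectiveEpilogueNoC = typename cutlass::epilogue::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    void, LayoutC, AlignmentC,      // C is void: no source accumulation
    ElementD, LayoutD, AlignmentD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
```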
## Performance Tips and Tricks
- *MMA Dimensions*: on both Blackwell and Hopper tensor cores, the smallest `MMA_M` dimension is 64, while the `MMA_N`
dimension can be as small as 8 for some instructions. For problem sizes where M is small, consider computing $D^T = \alpha B^T A^T + \beta C^T$ instead.
  - After swapping A and B and transposing, the N dimension becomes the small dimension. With a small `MMA_N` we can tile more effectively without performing unnecessary computation.
- *Layout Swapping*: When optimizing with the profiler, swap the `m` and `n` inputs and adjust the layouts to reflect this swapping and transposing.
  - For example, if we have a row-major A, column-major B, and row-major D, we can swap tensors and run a kernel with:
    - a left-hand matrix that is row-major (since B transposed is row-major),
    - a right-hand matrix that is column-major (since A transposed is column-major), and
    - a column-major output (since D transposed is column-major).
When using blockwise and groupwise GEMM, we must also swap the scale vector sizes when applying this optimization. If we have a 1 element scale granularity M
and a 128 element scale granularity N, we must run a kernel with a 128 element scale granularity M and a 1 element scale granularity N, as in the sketch below.
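For instance (a sketch reusing the profiler flags shown above; layouts must also be adjusted as described), a problem that would naturally run as `--m=16 --n=8192` with `--scale_vec_size_m=1 --scale_vec_size_n=128` can be profiled in swapped form as:
```
cutlass_profiler --operation=blockwiseGemm \
                 --enable-best-kernel-for-fixed-shape \
                 --m=8192 --n=16 --k=8192 \
                 --scale_vec_size_m=128 --scale_vec_size_n=1 --scale_vec_size_k=128 \
                 --verification-enabled=false
```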

View File

@ -0,0 +1,495 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/mixed_dtype_utils.hpp"
#include "cutlass/detail/collective/mixed_input_utils.hpp"
#include "helper.h"
#include "mixed_dtype_helper.cuh"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
using MmaType = cutlass::bfloat16_t;
using QuantType = cutlass::int4b_t;
using AccumulatorType = float;
// A matrix configuration
using ElementA = MmaType; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = QuantType; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// This example manually swaps and transposes, so keep transpose of input layouts
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
using ElementZero = MmaType;
using ElementScale = MmaType;
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutD = cutlass::layout::RowMajor;
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Core kernel configurations
using ElementAccumulator = AccumulatorType; // Element type for internal accumulation
using ElementCompute = AccumulatorType; // Element type for epilogue computation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using MmaTileShape = Shape<_256,_128,_128>; // (MmaTileShape_N, MmaTileShape_M, MmaTileShape_K) as A and B will be swapped
using ClusterShape = Shape<_2,_1,_1>; // Shape of the threadblocks in a cluster
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmMixedInputSm100; // Kernel to launch based on the default setting in the Collective Builder
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm;
using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
constexpr int ScaleGranularityN = 1; //Should be less than or equal to GEMM_N
constexpr int ScaleGranularityK = 128; //Should be less than or equal to GEMM_K
using ScaleConfig = cutlass::detail::Sm100MixedInputBlockwiseScaleConfig<ScaleGranularityN, ScaleGranularityK>;
using LayoutScale = decltype(ScaleConfig::deduce_layout_scale()); // Layout type for the blockwise scale operand
LayoutScale layout_S;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
MmaTileShape, ClusterShape,
EpilogueTileType,
ElementAccumulator, ElementCompute,
// Transpose layout of D here since we use explicit swap + transpose
// the void type for C tells the builder to allocate 0 smem for the C matrix.
// We can enable this if beta == 0 by changing ElementC to void below.
ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type, AlignmentC,
ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type, AlignmentD,
EpilogueSchedule // This is the only epi supporting the required swap + transpose.
>::CollectiveOp;
// ============================================================ MIXED INPUT NO SCALES ============================================================================
//The collective will infer that the narrow type should be upcasted to the wide type.
//We swap A and B operands to the builder here
using CollectiveMainloopConvertOnly = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB>, LayoutB_Transpose, AlignmentB,
ElementA, LayoutA_Transpose, AlignmentA,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
>,
MainloopSchedule
>::CollectiveOp;
using GemmKernelConvertOnly = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopConvertOnly,
CollectiveEpilogue
>;
using GemmConvertOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelConvertOnly>;
// =========================================================== MIXED INPUT WITH SCALES ===========================================================================
// The Scale information must get paired with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the scale information.
using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB, ElementScale>, cute::tuple<LayoutB_Transpose, LayoutScale>, AlignmentB,
ElementA, LayoutA_Transpose, AlignmentA,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
>,
MainloopSchedule
>::CollectiveOp;
using GemmKernelScaleOnly = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopScaleOnly,
CollectiveEpilogue
>;
using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleOnly>;
// =========================================================== MIXED INPUT WITH SCALES AND ZEROS ==================================================================
// We specify scale + zero elements to indicate that we require both. Scales and biases have the same format.
using CollectiveMainloopScaleWithZeroPoint = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB, ElementScale, ElementZero>, cute::tuple<LayoutB_Transpose, LayoutScale>, AlignmentB,
ElementA, LayoutA_Transpose, AlignmentA,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
>,
MainloopSchedule
>::CollectiveOp;
using GemmKernelScaleWithZeroPoint = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopScaleWithZeroPoint,
CollectiveEpilogue
>;
using GemmScaleWithZeroPoint = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleWithZeroPoint>;
// =================================================================================================================================================================
using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
using StrideC = typename GemmKernelScaleOnly::StrideC;
using StrideD = typename GemmKernelScaleOnly::StrideD;
using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
//
// Data members
//
/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideC_ref stride_C_ref;
StrideD stride_D;
StrideD_ref stride_D_ref;
uint64_t seed;
// Scale and Zero share a stride since the layout and shapes must be the same.
using StrideS = typename cute::Stride<cute::Int<1>, int64_t, int64_t>;
using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
StrideS stride_S;
StrideS_ref stride_S_ref;
cutlass::DeviceAllocation<ElementA> block_A;
cutlass::DeviceAllocation<ElementB> block_B;
cutlass::DeviceAllocation<MmaType> block_B_dq;
cutlass::DeviceAllocation<ElementScale> block_scale;
cutlass::DeviceAllocation<ElementZero> block_zero;
cutlass::DeviceAllocation<ElementC> block_C;
cutlass::DeviceAllocation<typename GemmScaleOnly::EpilogueOutputOp::ElementOutput> block_D;
cutlass::DeviceAllocation<typename GemmScaleOnly::EpilogueOutputOp::ElementOutput> block_ref_D;
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(MixedDtypeOptions const& options) {
auto shape_b = cute::make_shape(options.n, options.k, options.l);
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_b);
// Reverse stride here due to swap and transpose
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.n, options.m, options.l));
stride_C_ref = cutlass::make_cute_packed_stride(StrideC_ref{}, cute::make_shape(options.m, options.n, options.l));
// Reverse stride here due to swap and transpose
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.n, options.m, options.l));
stride_D_ref = cutlass::make_cute_packed_stride(StrideD_ref{}, cute::make_shape(options.m, options.n, options.l));
layout_S = ScaleConfig::tile_atom_to_shape_scale(make_shape(options.n, options.k, options.l));
auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_S)));
block_A.reset(a_coord.product());
block_B.reset(b_coord.product());
block_B_dq.reset(b_coord.product());
block_C.reset(c_coord.product());
block_D.reset(c_coord.product());
block_ref_D.reset(c_coord.product());
block_scale.reset(blockscale_b_coord.product());
block_zero.reset(blockscale_b_coord.product());
initialize_tensor(block_A, seed + 2022);
initialize_quant_tensor(block_B, seed + 2021);
initialize_tensor(block_C, seed + 2020);
initialize_scale<QuantType, ElementScale>(block_scale, options);
initialize_zero(block_zero, options);
if(options.verify){
auto layout_B = make_layout(shape_b, stride_B);
auto scale_stride = layout_S.stride();
auto layout_scale_zero = make_layout(
make_shape(size<0>(layout_S), size<1,1>(layout_S), size<2>(layout_S)),
make_stride(size<0,1>(scale_stride), size<1,1>(scale_stride), size<2>(scale_stride))
); // layout = (options.n, scale_k, options.l) : (_1, options.n, _0)
cudaStream_t stream = cudaStreamDefault;
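// Per the --mode help text, the dequantize call below follows
// B_dq = scale * B + zero (zero-point applied after scaling).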
cutlass::dequantize(block_B_dq.get(), block_B.get(), layout_B, block_scale.get(), block_zero.get(), layout_scale_zero, ScaleGranularityK, stream);
}
}
/// Populates a Gemm::Arguments structure from the given commandline options
template <class Args, cutlass::detail::ConversionMode KernelConversionMode>
Args args_from_options(MixedDtypeOptions const& options)
{
// Swap the A and B tensors, as well as problem shapes here.
if constexpr (KernelConversionMode == cutlass::detail::ConversionMode::DirectConvert) {
return Args {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.n, options.m, options.k, options.l},
{block_B.get(), stride_B, block_A.get(), stride_A},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
}
else if constexpr(KernelConversionMode == cutlass::detail::ConversionMode::ConvertAndScale) {
return Args {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.n, options.m, options.k, options.l},
{block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), layout_S},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
}
else if constexpr(KernelConversionMode == cutlass::detail::ConversionMode::ConvertAndScaleWithZero) {
return Args {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.n, options.m, options.k, options.l},
{block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), layout_S, block_zero.get()},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
};
} else {
exit(-1);
}
}
bool verify(MixedDtypeOptions const& options) {
//
// Compute reference output
//
using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
MmaType, LayoutA, AlignmentA,
MmaType, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloopRef,
CollectiveEpilogueRef
>;
using GemmRef = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelRef>;
typename GemmRef::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, options.l},
{block_A.get(), stride_A, block_B_dq.get(), stride_B},
{{options.alpha, options.beta}, block_C.get(), stride_C_ref, block_ref_D.get(), stride_D_ref}
};
// Run the gemm where the scaling is performed outside of the kernel.
GemmRef gemm_ref;
size_t workspace_size = GemmRef::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
CUTLASS_CHECK(gemm_ref.can_implement(arguments));
CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get()));
CUTLASS_CHECK(gemm_ref.run());
// compare_reference
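// A relative comparison with a small epsilon (rather than exact equality) is used
// because the reference multiplies pre-dequantized data and can round differently
// from the fused in-kernel conversion.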
ElementD const epsilon(1e-2f);
ElementD const non_zero_floor(1e-2f);
bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), epsilon, non_zero_floor);
return passed;
}
/// Execute a given example GEMM computation
template <typename Gemm>
int run(MixedDtypeOptions &options)
{
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options<typename Gemm::Arguments, Gemm::CollectiveMainloop::KernelConversionMode>(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
// Check if output from CUTLASS kernel and reference kernel are equal or not
MixedDtypeResult result;
if(options.verify){
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
}
else{
result.passed = true;
std::cout << " Verification: Off " << std::endl;
}
if (!result.passed) {
exit(-1);
}
mixed_dtype_profiling(gemm, options, result);
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 Toolkit to run this example
// and must have compute capability at least 100a.
bool is_correct_cuda_version = (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8);
if (!is_correct_cuda_version) {
std::cerr << "CUDA version is " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "\n";
std::cerr << "This example requires CUDA 12.8 or newer.\n";
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (props.major != 10 || props.minor != 0) {
std::cerr
<< "This example requires a GPU of NVIDIA's Blackwell Architecture or "
<< "later (compute capability 100a or greater).\n";
return 0;
}
//
// Parse options
//
MixedDtypeOptions options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
if (options.mode == MixedDtypeGemmMode::ConvertOnly) {
std::cout << "Running in conversion only mode." << std::endl;
run<GemmConvertOnly>(options);
}
else if (options.mode == MixedDtypeGemmMode::ScaleOnly) {
std::cout << "Running in scale mode." << std::endl;
run<GemmScaleOnly>(options);
}
else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) {
std::cout << "Running in scale and zero mode." << std::endl;
run<GemmScaleWithZeroPoint>(options);
}
else{
std::cerr << "Invalid mode " << options.mode << ". Must be 0, 1 or 2." << std::endl;
}
#endif
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,45 @@
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(TEST_S_TILE_SHAPE --m=256 --n=128 --k=32 --verify --iterations=0)
set(TEST_S_TILE_SHAPE_MULTIPLE_KITER --m=256 --n=128 --k=128 --verify --iterations=0)
set(TEST_S_DIFFERENT_MN --m=16384 --n=4608 --k=4608 --verify --iterations=0)
set(TEST_S_ONE_WAVE --m=1536 --n=1536 --k=32 --verify --iterations=0) # Assuming 144 SMs
set(TEST_S_2048 --m=2048 --n=2048 --k=2048 --verify --iterations=0) # Multi-wave
if(NOT WIN32)
cutlass_example_add_executable(
86_blackwell_mixed_dtype_gemm
86_blackwell_mixed_dtype.cu
TEST_COMMAND_OPTIONS
TEST_S_TILE_SHAPE
TEST_S_TILE_SHAPE_MULTIPLE_KITER
TEST_S_ONE_WAVE
TEST_S_2048
)
endif()

View File

@ -0,0 +1,269 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/reference/device/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cute/tensor.hpp"
#include <cuda.h>
#include <numeric>
#include "helper.h"
enum MixedDtypeGemmMode {
ConvertOnly,
ScaleOnly,
ScaleWithZeroPoint
};
/// Command line options parsing
struct MixedDtypeOptions {
bool help = false;
bool verify = false;
float alpha = 1.0f;
float beta = 0.0f;
int iterations = 1000;
int warmup = 1000;
int mode = 1;
int m = 5120, n = 4096, k = 4096;
int l = 1;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("verify")) {
verify = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("l", l);
cmd.get_cmd_line_argument("mode", mode);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("warmup", warmup);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "86_blackwell_mixed_dtype_gemm\n\n"
<< " Blackwell Mixed Data Type GEMM using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> The number of independent gemm problems with mnk shape\n"
<< " --mode=<int> The mode to run the gemm. 0 does (A @ B), 1 means A @ (scale * B), 2 means A @ (scale * B + zero-point).\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --warmup=<int> Number of warmup iterations to perform.\n\n"
<< " --verify=<int> Run verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "86_blackwell_mixed_dtype_gemm" << " --m=1024 --n=512 --k=1024 --l=10 --alpha=2 --mode=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const
{
// Two flops per multiply-add
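// Worked example: with the defaults m=5120, n=4096, k=4096, l=1,
// flop = 2 * 5120 * 4096 * 4096 ~= 1.72e11, so a 10 ms average runtime
// corresponds to roughly 17.2 TFLOP/s.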
uint64_t flop = uint64_t(2) * m * n * k * l;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct MixedDtypeResult
{
double avg_runtime_ms = 0.0;
double gflops = 0.0;
cutlass::Status status = cutlass::Status::kSuccess;
cudaError_t error = cudaSuccess;
bool passed = false;
};
/// Profiling Loop
template <class Gemm>
void mixed_dtype_profiling(
Gemm& gemm,
MixedDtypeOptions const& options,
MixedDtypeResult& result) {
if (options.iterations <= 0) return;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
std::vector<float> runtimes;
runtimes.reserve(options.iterations);
for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
cudaEventRecord(start);
CUTLASS_CHECK(gemm.run());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
if (iter >= options.warmup) {
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
runtimes.push_back(milliseconds);
}
}
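// Note: events are recorded and synchronized around every launch, so each sample
// includes per-launch overhead; the first `warmup` iterations are timed but discarded.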
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Compute average runtime and GFLOPs.
result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size();
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
/// Helpers to initialize a block of device data
template <class Element>
bool initialize_tensor(
cutlass::DeviceAllocation<Element>& block,
uint64_t seed = 2023) {
double scope_max, scope_min;
// Element serves as both input and output type here, so one bit count suffices.
int bits = cutlass::sizeof_bits<Element>::value;
if (bits == 1) {
scope_max = 2;
scope_min = 0;
}
else if (bits <= 8) {
scope_max = 2;
scope_min = -2;
}
else if (bits == 16) {
scope_max = 5;
scope_min = -5;
}
else {
scope_max = 8;
scope_min = -8;
}
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
return true;
}
template <typename Element>
bool initialize_quant_tensor(
cutlass::DeviceAllocation<Element>& block,
uint64_t seed = 2023) {
float scope_min = float(cutlass::platform::numeric_limits<Element>::lowest());
float scope_max = float(cutlass::platform::numeric_limits<Element>::max());
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
return true;
}
template <class QuantType, class Element>
bool initialize_scale(
cutlass::DeviceAllocation<Element>& block,
MixedDtypeOptions const& options,
uint64_t seed = 2023) {
if (options.mode == MixedDtypeGemmMode::ConvertOnly) {
// No scales, so just initialize with 1 so we can use the same kernel to dequantize the data.
std::vector<Element> stage(block.size(), Element(1.0f));
block.copy_from_host(stage.data());
}
else {
float elt_max_f = float(cutlass::platform::numeric_limits<QuantType>::max());
const float max_dequant_val = 4.f;
const float min_dequant_val = 0.5f;
float scope_max(max_dequant_val / elt_max_f);
float scope_min(min_dequant_val / elt_max_f);
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
}
return true;
}
template <class Element>
bool initialize_zero(
cutlass::DeviceAllocation<Element>& block,
MixedDtypeOptions const& options,
uint64_t seed = 2023) {
if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) {
cutlass::reference::device::BlockFillRandomUniform(
block.get(), block.size(), seed, Element(2.0f), Element(-2.0f));
} else {
// No bias, so just initialize with 0 so we can use the same kernel to dequantize the data.
std::vector<Element> stage(block.size(), Element(0.0f));
block.copy_from_host(stage.data());
}
return true;
}
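/// Host-side reference sketch (illustrative only; the function name and the
/// (n, scale_k) scale indexing are assumptions mirroring initialize() in the
/// example, not part of the kernels): dequantizes a column-major k x n quantized
/// B following the --mode semantics above, B_dq = scale * B + zero.
template <class MmaType, class QuantType, class ElementScale, class ElementZero>
void dequantize_reference_host(
    MmaType* B_dq, QuantType const* B,
    ElementScale const* scale, ElementZero const* zero,
    int n, int k, int scale_granularity_k) {
  for (int col = 0; col < n; ++col) {
    for (int row = 0; row < k; ++row) {
      // One (scale, zero) pair per column and per scale_granularity_k rows of B,
      // matching the (n, scale_k) : (1, n) layout used for verification.
      int sidx = col + (row / scale_granularity_k) * n;
      float dq = float(scale[sidx]) * float(B[row + col * k]) + float(zero[sidx]);
      B_dq[row + col * k] = MmaType(dq);
    }
  }
}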

View File

@ -0,0 +1,518 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief An FP8 blockwise scaled GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
This example demonstrates a simple way to instantiate and run a blockwise scaling FP8 GEMM on the NVIDIA Blackwell SM120 architecture.
This kernel is optimized for the GeForce RTX 50 series GPUs.
This kernel accepts Inputs A and B with TileMxTileK and TileNxTileK FP32 block scaling, performing scaling and accumulation every TileK elements.
Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
3. Epilogue Optimization
Note that GeForce RTX 50 series GPUs do not support:
1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
2. Dynamic datatypes.
3. Runtime scaling block size.
Usage:
$ ./examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise --m=2048 --n=2048 --k=2048
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "helper.h"
#include "./utils.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = AlignmentC;
// MMA type
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
using ElementCompute = float;
// MMA and Cluster Tile Shapes
// Shape of the tile
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = Shape<_1,_1,_1>;
using ScaleConfig = decltype(cutlass::detail::sm120_trivial_blockwise_scale_config(MmaTileShape_MNK{}));
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
ElementA, cute::tuple<LayoutA, LayoutSFA>, AlignmentA,
ElementB, cute::tuple<LayoutB, LayoutSFB>, AlignmentB,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
using StrideC = typename Gemm::GemmKernel::StrideC;
using StrideD = typename Gemm::GemmKernel::StrideD;
/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideD stride_D;
// Strides just iterate over scalars and have no zeros
LayoutSFA layout_SFA;
LayoutSFB layout_SFB;
// Layouts are tiled to the problem size and the strides have zeros
uint64_t seed;
cutlass::HostTensor<ElementA , LayoutA> tensor_A;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFA;
cutlass::HostTensor<ElementB , LayoutB> tensor_B;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFB;
cutlass::HostTensor<ElementC , LayoutC> tensor_C;
cutlass::HostTensor<ElementD , LayoutD> tensor_D;
cutlass::HostTensor<ElementD , LayoutD> tensor_ref_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help = false;
bool skip_verification = false;
float alpha = 1.f, beta = 0.f;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("skip-verification")) {
skip_verification = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("l", l);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "87a_blackwell_geforce_gemm_blockwise\n\n"
<< " Blackwell FP8 GEMM with Blockwise Scaling using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> Sets the l extent (batch) of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --skip-verification Skip verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "87a_blackwell_geforce_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result {
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l));
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));
layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l));
layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l));
auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA)));
auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB)));
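// filter_zeros() folds away the broadcast (stride-0) modes of the scale-factor
// layouts, so size() counts only the distinct scale elements that need storage.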
tensor_A.resize(a_coord);
tensor_B.resize(b_coord);
tensor_C.resize(c_coord);
tensor_D.resize(c_coord);
tensor_ref_D.resize(c_coord);
tensor_SFA.resize(blockscale_a_coord);
tensor_SFB.resize(blockscale_b_coord);
initialize_tensor(tensor_A.host_view(), cutlass::Distribution::Uniform, seed + 2022);
initialize_tensor(tensor_B.host_view(), cutlass::Distribution::Uniform, seed + 2023);
initialize_tensor(tensor_C.host_view(), cutlass::Distribution::Uniform, seed + 2024);
initialize_tensor(tensor_SFA.host_view(), cutlass::Distribution::Uniform, seed + 2025);
initialize_tensor(tensor_SFB.host_view(), cutlass::Distribution::Uniform, seed + 2026);
tensor_A.sync_device();
tensor_B.sync_device();
tensor_C.sync_device();
tensor_D.sync_device();
tensor_SFA.sync_device();
tensor_SFB.sync_device();
}
/// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options) {
typename Gemm::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, options.l},
{tensor_A.device_data(), stride_A,
tensor_B.device_data(), stride_B,
tensor_SFA.device_data(), layout_SFA,
tensor_SFB.device_data(), layout_SFB},
{
{}, // epilogue.thread
tensor_C.device_data(), stride_C,
tensor_D.device_data(), stride_D
}
};
auto &fusion_args = arguments.epilogue.thread;
fusion_args.alpha = options.alpha;
fusion_args.beta = options.beta;
return arguments;
}
bool verify(Options const& options) {
//
// Compute reference output
//
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
auto B = cute::make_tensor(tensor_B.host_data(),
cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
auto C = cute::make_tensor(tensor_C.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
auto D = cute::make_tensor(tensor_ref_D.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D));
auto SFA = cute::make_tensor(tensor_SFA.host_data(), layout_SFA);
auto SFB = cute::make_tensor(tensor_SFB.host_data(), layout_SFB);
using unused_t = decltype(D);
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator,
decltype(A),
decltype(SFA),
decltype(B),
decltype(SFB)
> mainloop_params{A, SFA, B, SFB};
cutlass::reference::host::GettEpilogueParams<
ElementAccumulator,
ElementAccumulator,
ElementAccumulator,
ElementCompute,
decltype(C),
decltype(D)
> epilogue_params;
epilogue_params.C = C;
epilogue_params.D = D;
epilogue_params.alpha = options.alpha;
epilogue_params.beta = options.beta;
// get reference result
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
tensor_D.sync_host();
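// Note: a bitwise comparison is used here; the host reference is expected to
// reproduce the kernel's FP32 scale-and-accumulate results exactly.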
bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
return passed;
}
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
Result result;
if (!options.skip_verification) {
// Check if output from CUTLASS kernel and reference kernel are equal or not
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
}
// Run profiling loop
if (options.iterations > 0) {
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU with compute capability 120a or 121a)." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Run
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,539 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief An FP8 groupwise scaled GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
This example demonstrates a simple way to instantiate and run cooperative and ping-pong groupwise scaling FP8 GEMMs on the NVIDIA Blackwell SM120 architecture.
These kernels are optimized for GeForce RTX 50 series GPUs.
The groupwise-scaling kernels accept inputs A and B with 1xTileK and TileNxTileK FP32 scaling, performing scaling and accumulation every TileK elements.
The ping-pong kernel leverages a smaller tile shape to avoid register spilling for better performance.
Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
3. Epilogue Optimization
Note that GeForce RTX 50 series GPUs do not support:
1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
2. Dynamic datatypes.
3. Runtime scaling block size.
Usage:
$ ./examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise --m=2048 --n=2048 --k=2048
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "helper.h"
#include "./utils.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = AlignmentC;
// MMA type
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
using ElementCompute = float;
// MMA and Cluster Tile Shapes
// Shape of the tile
using CooperativeMmaTileShape_MNK = Shape<_128,_128,_128>;
// Smaller tile size for pingpong schedule to avoid register spilling
using PingpongMmaTileShape_MNK = Shape<_64, _128, _128>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = Shape<_1,_1,_1>;
constexpr int ScaleGranularityM = 1;
constexpr int ScaleGranularityN = 128;
constexpr int ScaleGranularityK = 128;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
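// With these granularities, SFA holds one FP32 scale per 1x128 (M x K) chunk of A
// (m * ceil(k/128) scales per batch) and SFB holds one per 128x128 (N x K) block
// of B (ceil(n/128) * ceil(k/128) per batch).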
template <class TileShape>
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
TileShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
template <class TileShape, class Schedule>
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
ElementA, cute::tuple<LayoutA, LayoutSFA>, AlignmentA,
ElementB, cute::tuple<LayoutB, LayoutSFB>, AlignmentB,
ElementAccumulator,
TileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue<TileShape>::SharedStorage))>,
Schedule // cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120
>::CollectiveOp;
template <class TileShape, class Schedule>
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop<TileShape, Schedule>,
CollectiveEpilogue<TileShape>,
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
// We are using cooperative kernel schedule by default
using CooperativeGemm = cutlass::gemm::device::GemmUniversalAdapter<
GemmKernel<CooperativeMmaTileShape_MNK, cutlass::gemm::KernelScheduleSm120Blockwise>>;
// Pingpong kernel
using PingpongGemm = cutlass::gemm::device::GemmUniversalAdapter<
GemmKernel<PingpongMmaTileShape_MNK, cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120>>;
using StrideA = typename CooperativeGemm::GemmKernel::StrideA;
using StrideB = typename CooperativeGemm::GemmKernel::StrideB;
using StrideC = typename CooperativeGemm::GemmKernel::StrideC;
using StrideD = typename CooperativeGemm::GemmKernel::StrideD;
/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideD stride_D;
// Strides just iterate over scalars and have no zeros
LayoutSFA layout_SFA;
LayoutSFB layout_SFB;
// Layouts are tiled to the problem size and the strides have zeros
uint64_t seed;
cutlass::HostTensor<ElementA , LayoutA> tensor_A;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFA;
cutlass::HostTensor<ElementB , LayoutB> tensor_B;
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFB;
cutlass::HostTensor<ElementC , LayoutC> tensor_C;
cutlass::HostTensor<ElementD , LayoutD> tensor_D;
cutlass::HostTensor<ElementD , LayoutD> tensor_ref_D;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help = false;
bool skip_verification = false;
float alpha = 1.f, beta = 0.f;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("skip-verification")) {
skip_verification = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("l", l);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "87b_blackwell_geforce_gemm_groupwise\n\n"
<< " Blackwell FP8 GEMM with Blockwise Scaling using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> Sets the l extent (batch) of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --skip-verification Skip verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "87b_blackwell_geforce_gemm_groupwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result {
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k);
stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l));
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));
layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l));
layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l));
auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA)));
auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB)));
tensor_A.resize(a_coord);
tensor_B.resize(b_coord);
tensor_C.resize(c_coord);
tensor_D.resize(c_coord);
tensor_ref_D.resize(c_coord);
tensor_SFA.resize(blockscale_a_coord);
tensor_SFB.resize(blockscale_b_coord);
initialize_tensor(tensor_A.host_view(), cutlass::Distribution::Uniform, seed + 2022);
initialize_tensor(tensor_B.host_view(), cutlass::Distribution::Uniform, seed + 2023);
initialize_tensor(tensor_C.host_view(), cutlass::Distribution::Uniform, seed + 2024);
initialize_tensor(tensor_SFA.host_view(), cutlass::Distribution::Uniform, seed + 2025);
initialize_tensor(tensor_SFB.host_view(), cutlass::Distribution::Uniform, seed + 2026);
tensor_A.sync_device();
tensor_B.sync_device();
tensor_C.sync_device();
tensor_D.sync_device();
tensor_SFA.sync_device();
tensor_SFB.sync_device();
}
/// Populates a Gemm::Arguments structure from the given commandline options
template <class Gemm>
typename Gemm::Arguments args_from_options(const Options &options) {
typename Gemm::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, options.l},
{tensor_A.device_data(), stride_A,
tensor_B.device_data(), stride_B,
tensor_SFA.device_data(), layout_SFA,
tensor_SFB.device_data(), layout_SFB},
{
{}, // epilogue.thread
tensor_C.device_data(), stride_C,
tensor_D.device_data(), stride_D
}
};
auto &fusion_args = arguments.epilogue.thread;
fusion_args.alpha = options.alpha;
fusion_args.beta = options.beta;
return arguments;
}
bool verify(const Options &options) {
//
// Compute reference output
//
// Create instantiation for device reference gemm kernel
auto A = cute::make_tensor(tensor_A.host_data(),
cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
auto B = cute::make_tensor(tensor_B.host_data(),
cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
auto C = cute::make_tensor(tensor_C.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
auto D = cute::make_tensor(tensor_ref_D.host_data(),
cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D));
auto SFA = cute::make_tensor(tensor_SFA.host_data(), layout_SFA);
auto SFB = cute::make_tensor(tensor_SFB.host_data(), layout_SFB);
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator,
decltype(A),
decltype(SFA),
decltype(B),
decltype(SFB)
> mainloop_params{A, SFA, B, SFB};
cutlass::reference::host::GettEpilogueParams<
ElementAccumulator,
ElementAccumulator,
ElementAccumulator,
ElementCompute,
decltype(C),
decltype(D)
> epilogue_params;
epilogue_params.C = C;
epilogue_params.D = D;
epilogue_params.alpha = options.alpha;
epilogue_params.beta = options.beta;
// get reference result
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// compare_reference
tensor_D.sync_host();
bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
return passed;
}
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options<Gemm>(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
Result result;
if (!options.skip_verification) {
// Check if output from CUTLASS kernel and reference kernel are equal or not
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
}
// Run profiling loop
if (options.iterations > 0) {
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Run
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
printf("Running kernel with Cooperative kernel schedule:\n");
run<CooperativeGemm>(options);
printf("Running kernel with Pingpong kernel schedule:\n");
run<PingpongGemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,678 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief An FP8 groupwise scaled grouped GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
This example demonstrates an implementation of Grouped GEMM using a TMA + Blackwell SM120 TensorOp-based warp-specialized kernel
for FP8 with per-group 1x128x128 FP32 scaling factors.
In this example, M, N, and K are fixed across groups.
As RTX 50 series GPUs do not support runtime scaling block sizes, all groups share the same block scaling size.
For this example all scheduling work is performed on the device, using device-side modification of TMA descriptors
to move between groups (the problem count, set via --groups).
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#encoding-a-tensor-map-on-device
To run this example:
$ ./examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise --m=2048 --n=2048 --k=2048 --groups=10
The above command sizes all 10 groups at the given m, n, and k extents.
The same applies to alpha and beta, which are randomized per group when left unspecified.
*/
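/*
A hedged sketch (illustrative only, not used by this example): since ProblemShape is
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>, per-group extents could in
principle differ, e.g.

std::vector<cute::Shape<int,int,int>> problem_sizes_host;
problem_sizes_host.push_back({1024, 512, 2048}); // group 0
problem_sizes_host.push_back({2048, 1024, 4096}); // group 1

This example keeps M, N, and K identical across groups, and on RTX 50 series GPUs
all groups must share the same compile-time 1x128x128 scaling block size.
*/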
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "helper.h"
#include "./utils.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int,int,int>>;
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = AlignmentC;
// MMA type
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
using ElementCompute = float;
// MMA and Cluster Tile Shapes
// Shape of the tile
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Scaling Factors
using ElementSF = ElementAccumulator;
constexpr int ScaleGranularityM = 1;
constexpr int ScaleGranularityN = 128;
constexpr int ScaleGranularityK = 128;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
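// A hedged illustration of the scale-factor footprint: with ScaleGranularity
// (M,N,K) = (1,128,128), a single (M,N,K) = (1024,1024,1024) group stores
// ceil(1024/1) * ceil(1024/128) = 1024 * 8 FP32 scale factors for A and
// ceil(1024/128) * ceil(1024/128) = 8 * 8 for B (see the filter_zeros()-sized
// allocations in initialize() below).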
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
ElementA, cute::tuple<LayoutA *, LayoutSFA *>, AlignmentA,
ElementB, cute::tuple<LayoutB *, LayoutSFB *>, AlignmentB,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelScheduleSm120Blockwise
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue,
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
static_assert(cute::is_same_v<typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA, LayoutSFA>);
static_assert(cute::is_same_v<typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB, LayoutSFB>);
/// Initialization
uint64_t seed;
std::vector<StrideA> stride_A_host;
std::vector<StrideB> stride_B_host;
std::vector<StrideC> stride_C_host;
std::vector<StrideD> stride_D_host;
std::vector<LayoutSFA> layout_SFA_host;
std::vector<LayoutSFB> layout_SFB_host;
std::vector<ElementAccumulator> alpha_host;
std::vector<ElementAccumulator> beta_host;
using HostTensorA = cutlass::HostTensor<ElementA, cutlass::layout::PackedVectorLayout>;
using HostTensorB = cutlass::HostTensor<ElementB, cutlass::layout::PackedVectorLayout>;
using HostTensorC = cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout>;
using HostTensorD = cutlass::HostTensor<Gemm::EpilogueOutputOp::ElementOutput, cutlass::layout::PackedVectorLayout>;
using HostTensorSFA = cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout>;
using HostTensorSFB = cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout>;
std::vector<HostTensorA> block_A;
std::vector<HostTensorB> block_B;
std::vector<HostTensorC> block_C;
std::vector<HostTensorD> block_D;
std::vector<HostTensorD> block_ref_D;
std::vector<HostTensorSFA> block_SFA;
std::vector<HostTensorSFB> block_SFB;
cutlass::DeviceAllocation<typename ProblemShape::UnderlyingProblemShape> problem_sizes;
cutlass::DeviceAllocation<ElementA const*> ptr_A;
cutlass::DeviceAllocation<ElementB const*> ptr_B;
cutlass::DeviceAllocation<ElementSF const*> ptr_SFA;
cutlass::DeviceAllocation<ElementSF const*> ptr_SFB;
cutlass::DeviceAllocation<ElementC const*> ptr_C;
cutlass::DeviceAllocation<ElementD *> ptr_D;
cutlass::DeviceAllocation<ElementD *> ptr_ref_D;
cutlass::DeviceAllocation<StrideA> stride_A;
cutlass::DeviceAllocation<StrideB> stride_B;
cutlass::DeviceAllocation<StrideC> stride_C;
cutlass::DeviceAllocation<StrideD> stride_D;
cutlass::DeviceAllocation<LayoutSFA> layout_SFA;
cutlass::DeviceAllocation<LayoutSFB> layout_SFB;
cutlass::DeviceAllocation<ElementAccumulator*> alpha_device;
cutlass::DeviceAllocation<ElementAccumulator*> beta_device;
cutlass::DeviceAllocation<ElementAccumulator> block_alpha;
cutlass::DeviceAllocation<ElementAccumulator> block_beta;
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
// Command line options parsing
struct Options {
bool help = false;
bool skip_verification = false;
float alpha = 1.f, beta = 0.f;
int iterations = 1000;
int m = 1024, n = 512, k = 1024, l = 1, groups = 10;
std::vector<typename ProblemShape::UnderlyingProblemShape> problem_sizes_host;
RasterOrderOptions raster_order = RasterOrderOptions::AlongN;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
if (cmd.check_cmd_line_flag("skip-verification")) {
skip_verification = true;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("groups", groups);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
char raster_char;
cmd.get_cmd_line_argument("raster", raster_char, 'N');
if (raster_char == 'N' || raster_char == 'n') {
raster_order = RasterOrderOptions::AlongN;
} else if (raster_char == 'M' || raster_char == 'm') {
raster_order = RasterOrderOptions::AlongM;
}
for (int i = 0; i < groups; ++i) {
problem_sizes_host.push_back({m, n, k});
}
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "87c_blackwell_geforce_grouped_gemm_groupwise\n\n"
<< " Blackwell FP8 GEMM with Groupwise Scaling using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --groups=<int> Sets the number of individual GEMM problems for Grouped GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
<< " --skip-verification Skip verification.\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "87c_blackwell_geforce_grouped_gemm_groupwise" << " --m=1024 --n=512 --k=1024 --groups=8 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k * groups;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result {
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
std::vector<ElementA *> ptr_A_host(options.groups);
std::vector<ElementB *> ptr_B_host(options.groups);
std::vector<ElementSF *> ptr_SFA_host(options.groups);
std::vector<ElementSF *> ptr_SFB_host(options.groups);
std::vector<ElementC *> ptr_C_host(options.groups);
std::vector<ElementD *> ptr_D_host(options.groups);
std::vector<ElementAccumulator *> ptr_alpha_host(options.groups);
std::vector<ElementAccumulator *> ptr_beta_host(options.groups);
block_alpha.reset(options.groups);
block_beta.reset(options.groups);
for (int i = 0; i < options.groups; ++i) {
auto problem = options.problem_sizes_host.at(i);
auto [M, N, K] = problem;
auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1});
auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1});
auto stride_C = cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1});
auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1});
auto layout_A = make_layout(make_shape(M, K, 1), stride_A);
auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
auto layout_D = make_layout(make_shape(M, N, 1), stride_D);
auto layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1));
auto layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1));
stride_A_host.push_back(stride_A);
stride_B_host.push_back(stride_B);
layout_SFA_host.push_back(layout_SFA);
layout_SFB_host.push_back(layout_SFB);
stride_C_host.push_back(stride_C);
stride_D_host.push_back(stride_D);
block_A.push_back(HostTensorA(cutlass::make_Coord(size(layout_A))));
block_B.push_back(HostTensorB(cutlass::make_Coord(size(layout_B))));
block_C.push_back(HostTensorC(cutlass::make_Coord(size(layout_C))));
block_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D))));
block_SFA.push_back(HostTensorSFA(cutlass::make_Coord(size(filter_zeros(layout_SFA)))));
block_SFB.push_back(HostTensorSFB(cutlass::make_Coord(size(filter_zeros(layout_SFB)))));
block_ref_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D))));
}
for (int i = 0; i < options.groups; ++i) {
initialize_tensor(block_A.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2022);
initialize_tensor(block_B.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2023);
initialize_tensor(block_C.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2024);
initialize_tensor(block_SFA.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2025);
initialize_tensor(block_SFB.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2026);
block_A.at(i).sync_device();
block_B.at(i).sync_device();
block_C.at(i).sync_device();
block_SFA.at(i).sync_device();
block_SFB.at(i).sync_device();
ptr_A_host.at(i) = block_A.at(i).device_data();
ptr_B_host.at(i) = block_B.at(i).device_data();
ptr_C_host.at(i) = block_C.at(i).device_data();
ptr_D_host.at(i) = block_D.at(i).device_data();
ptr_SFA_host.at(i) = block_SFA.at(i).device_data();
ptr_SFB_host.at(i) = block_SFB.at(i).device_data();
alpha_host.push_back((options.alpha == std::numeric_limits<float>::max()) ? static_cast<ElementAccumulator>((rand() % 5) + 1) : options.alpha);
beta_host.push_back((options.beta == std::numeric_limits<float>::max()) ? static_cast<ElementAccumulator>(rand() % 5) : options.beta);
ptr_alpha_host.at(i) = block_alpha.get() + i;
ptr_beta_host.at(i) = block_beta.get() + i;
}
problem_sizes.reset(options.groups);
problem_sizes.copy_from_host(options.problem_sizes_host.data());
ptr_A.reset(options.groups);
ptr_A.copy_from_host(ptr_A_host.data());
ptr_B.reset(options.groups);
ptr_B.copy_from_host(ptr_B_host.data());
ptr_SFA.reset(options.groups);
ptr_SFA.copy_from_host(ptr_SFA_host.data());
ptr_SFB.reset(options.groups);
ptr_SFB.copy_from_host(ptr_SFB_host.data());
ptr_C.reset(options.groups);
ptr_C.copy_from_host(ptr_C_host.data());
ptr_D.reset(options.groups);
ptr_D.copy_from_host(ptr_D_host.data());
stride_A.reset(options.groups);
stride_A.copy_from_host(stride_A_host.data());
stride_B.reset(options.groups);
stride_B.copy_from_host(stride_B_host.data());
layout_SFA.reset(options.groups);
layout_SFA.copy_from_host(layout_SFA_host.data());
layout_SFB.reset(options.groups);
layout_SFB.copy_from_host(layout_SFB_host.data());
stride_C.reset(options.groups);
stride_C.copy_from_host(stride_C_host.data());
stride_D.reset(options.groups);
stride_D.copy_from_host(stride_D_host.data());
alpha_device.reset(options.groups);
alpha_device.copy_from_host(ptr_alpha_host.data());
beta_device.reset(options.groups);
beta_device.copy_from_host(ptr_beta_host.data());
block_alpha.copy_from_host(alpha_host.data());
block_beta.copy_from_host(beta_host.data());
}
/// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options) {
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = 0;
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
scheduler.raster_order = options.raster_order;
typename Gemm::Arguments arguments;
decltype(arguments.epilogue.thread) fusion_args;
fusion_args.alpha_ptr = nullptr;
fusion_args.beta_ptr = nullptr;
if (options.alpha != std::numeric_limits<float>::max()) {
fusion_args.alpha = options.alpha;
fusion_args.alpha_ptr_array = nullptr;
fusion_args.dAlpha = {_0{}, _0{}, 0};
} else {
fusion_args.alpha = 0;
fusion_args.alpha_ptr_array = alpha_device.get();
fusion_args.dAlpha = {_0{}, _0{}, 1};
}
if (options.beta != std::numeric_limits<float>::max()) {
fusion_args.beta = options.beta;
fusion_args.beta_ptr_array = nullptr;
fusion_args.dBeta = {_0{}, _0{}, 0};
} else {
fusion_args.beta = 0;
fusion_args.beta_ptr_array = beta_device.get();
fusion_args.dBeta = {_0{}, _0{}, 1};
}
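// Note on the dAlpha/dBeta strides above: {_0{}, _0{}, L} is an (M, N, group)
// stride, so L == 0 broadcasts a single host-side scalar to every group, while
// L == 1 reads one scalar per group from the device-side pointer array.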
arguments = {
cutlass::gemm::GemmUniversalMode::kGrouped,
{options.groups, problem_sizes.get(), options.problem_sizes_host.data()},
{ptr_A.get(), stride_A.get(),
ptr_B.get(), stride_B.get(),
ptr_SFA.get(), layout_SFA.get(),
ptr_SFB.get(), layout_SFB.get()},
{
fusion_args,
ptr_C.get(), stride_C.get(),
ptr_D.get(), stride_D.get()
},
hw_info, scheduler
};
return arguments;
}
bool verify(const Options &options) {
//
// Compute reference output
//
bool passed = true;
for (int i = 0; i < options.groups; ++i) {
auto problem = options.problem_sizes_host.at(i);
auto [M, N, K] = problem;
auto A = cute::make_tensor(block_A.at(i).host_data(),
cute::make_layout(cute::make_shape(M, K, 1), stride_A_host.at(i)));
auto B = cute::make_tensor(block_B.at(i).host_data(),
cute::make_layout(cute::make_shape(N, K, 1), stride_B_host.at(i)));
auto C = cute::make_tensor(block_C.at(i).host_data(),
cute::make_layout(cute::make_shape(M, N, 1), stride_C_host.at(i)));
auto D = cute::make_tensor(block_ref_D.at(i).host_data(),
cute::make_layout(cute::make_shape(M, N, 1), stride_D_host.at(i)));
auto SFA = cute::make_tensor(block_SFA.at(i).host_data(), layout_SFA_host.at(i));
auto SFB = cute::make_tensor(block_SFB.at(i).host_data(), layout_SFB_host.at(i));
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator,
decltype(A),
decltype(SFA),
decltype(B),
decltype(SFB)
> mainloop_params{A, SFA, B, SFB};
cutlass::reference::host::GettEpilogueParams<
ElementAccumulator,
ElementAccumulator,
ElementAccumulator,
ElementCompute,
decltype(C),
decltype(D)
> epilogue_params;
epilogue_params.C = C;
epilogue_params.D = D;
epilogue_params.alpha = alpha_host.at(i);
epilogue_params.beta = beta_host.at(i);
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
block_D.at(i).sync_host();
passed &= cutlass::reference::host::TensorEquals(block_ref_D.at(i).host_view(), block_D.at(i).host_view());
}
return passed;
}
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
Result result;
if (!options.skip_verification) {
// Check if output from CUTLASS kernel and reference kernel are equal or not
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
}
// Run profiling loop
if (options.iterations > 0) {
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << " " << options.groups << " Groups" << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
// or CUDA 12.9 or higher for SM121 support.
// Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
#endif
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Run
//
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
initialize(options);
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,47 @@
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
cutlass_example_add_executable(
87a_blackwell_geforce_fp8_bf16_gemm_blockwise
87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu
)
cutlass_example_add_executable(
87b_blackwell_geforce_fp8_bf16_gemm_groupwise
87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu
)
cutlass_example_add_executable(
87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise
87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu
)
endif()

View File

@ -0,0 +1,83 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/// Helper to initialize a block of device data
template <class Element, class Layout>
bool initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
double scope_max, scope_min;
int bits_input = cutlass::sizeof_bits<Element>::value;
if (bits_input == 1) {
scope_max = 2;
scope_min = 0;
}
else if (bits_input <= 8) {
scope_max = 2;
scope_min = -2;
}
else if (bits_input == 16) {
scope_max = 5;
scope_min = -5;
}
else {
scope_max = 8;
scope_min = -8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope_max, scope_min, 0);
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implemented.");
}
return true;
}

View File

@ -0,0 +1,545 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM103 architecture.
This example demonstrates a simple way to instantiate and run a blockscaled 3xFP4 GEMM on the NVIDIA Blackwell SM103 architecture.
Usage:
$ ./examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm --m=2048 --n=2048 --k=2048
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/detail/sm100_blockscaled_layout.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "helper.h"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////
// A matrix configuration
using ElementA = cutlass::float_e2m1_t; // Element type for A matrix operand
using ElementSFA = cutlass::float_ue4m3_t;
using LayoutATag = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 32; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::float_e2m1_t; // Element type for B matrix operand
using ElementSFB = cutlass::float_ue4m3_t;
using LayoutBTag = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 32; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C/D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operand
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operand
using LayoutCTag = cutlass::layout::RowMajor; // Layout type for C matrix operand
using LayoutDTag = cutlass::layout::RowMajor; // Layout type for D matrix operand
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// Kernel functional config
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm103; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
// using ElementD = cutlass::float_e2m1_t; // Enable for SF Output // Element type for D matrix operands
// Kernel Perf config
using MmaTileShape = cute::Shape<cute::_128, cute::_128, Int<768>>; // MMA's tile size
using ClusterShape = cute::Shape<cute::_2, cute::_4, cute::_1>; // Shape of the threadblocks in a cluster
// Epilogue fusion operator
using EpilogueFusionOp = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutCTag, AlignmentC,
ElementD, LayoutDTag, AlignmentD,
cutlass::epilogue::NoSmemWarpSpecialized1Sm,
EpilogueFusionOp
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementA,ElementSFA>, LayoutATag, AlignmentA,
cute::tuple<ElementB,ElementSFB>, LayoutBTag, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103 // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>, // Indicates ProblemShape
CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Reference device GEMM implementation type
using StrideA = typename Gemm::GemmKernel::StrideA;
using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA; // Scale factor tensors have an interleaved layout, so carry the full Layout rather than just a stride.
using StrideB = typename Gemm::GemmKernel::StrideB;
using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB; // Scale factor tensors have an interleaved layout, so carry the full Layout rather than just a stride.
using StrideC = typename Gemm::GemmKernel::StrideC;
using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
using StrideD = typename Gemm::GemmKernel::StrideD;
using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));
//
// Data members
//
/// Initialization
StrideA stride_A;
LayoutA layout_A;
LayoutSFA layout_SFA;
StrideB stride_B;
LayoutB layout_B;
LayoutSFB layout_SFB;
StrideC stride_C;
LayoutC layout_C;
StrideD stride_D;
LayoutD layout_D;
uint64_t seed;
// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
cutlass::HostTensor<ElementA, cutlass::layout::PackedVectorLayout> block_A;
cutlass::HostTensor<ElementSFA, cutlass::layout::PackedVectorLayout> block_SFA;
cutlass::HostTensor<ElementB, cutlass::layout::PackedVectorLayout> block_B;
cutlass::HostTensor<ElementSFB, cutlass::layout::PackedVectorLayout> block_SFB;
cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
// Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
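// make_iterator wraps a raw pointer via cute::recast_ptr, which for sub-byte
// element types (such as 4-bit float_e2m1_t) yields a subbyte iterator so the
// host reference code can address individual elements.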
template <typename T>
auto make_iterator(T* ptr) {
return cute::recast_ptr<T>(ptr);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
float alpha, beta;
int iterations;
int m, n, k;
int swizzle = 0;
Options():
help(false),
alpha(1.f), beta(0.f),
iterations(10),
m(1024), n(1024), k(1024),
swizzle(0)
{ }
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("n", n);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("swizzle", swizzle);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "89_sm103_fp4_ultra_gemm\n\n"
<< " Sm103 3xFP4 GEMM using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --swizzle=<int> Cluster rasterization swizzle\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ " << "./examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const
{
// Two flops per multiply-add
uint64_t flop = uint64_t(2) * m * n * k;
double gflop = double(flop) / double(1.0e9);
return gflop / runtime_s;
}
};
/// Result structure
struct Result
{
double avg_runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
Result(
double avg_runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess)
:
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
{}
};
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
template <typename Element, typename Layout>
bool initialize_block(
cutlass::TensorView<Element, Layout> view,
uint64_t seed) {
double scope_max, scope_min;
constexpr int bits_input = cutlass::sizeof_bits<Element>::value;
if constexpr (bits_input == 1) {
scope_max = 2;
scope_min = 0;
}
else if constexpr (bits_input <= 6) {
scope_max = 2;
scope_min = -2;
}
else if constexpr (bits_input <= 8) {
if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
scope_max = 4;
scope_min = 1;
}
else {
scope_max = 1;
scope_min = -1;
}
}
else{
scope_max = 4;
scope_min = -4;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope_max, scope_min, 0);
return true;
}
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
using namespace cute;
// For SFA and SFB tensors layouts
using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});
layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));
block_A.reset(cutlass::make_Coord(size(layout_A)));
block_B.reset(cutlass::make_Coord(size(layout_B)));
block_C.reset(cutlass::make_Coord(size(layout_C)));
block_D.reset(cutlass::make_Coord(size(layout_D)));
block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));
initialize_block(block_A.host_view(), seed + 2021);
initialize_block(block_B.host_view(), seed + 2022);
initialize_block(block_C.host_view(), seed + 2023);
initialize_block(block_SFA.host_view(), seed + 2024);
initialize_block(block_SFB.host_view(), seed + 2025);
block_A.sync_device();
block_B.sync_device();
block_C.sync_device();
block_SFA.sync_device();
block_SFB.sync_device();
}
// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options)
{
typename Gemm::Arguments arguments {
cutlass::gemm::GemmUniversalMode::kGemm,
{options.m, options.n, options.k, 1},
{ // Mainloop arguments
block_A.device_data(), stride_A,
block_B.device_data(), stride_B,
block_SFA.device_data(), layout_SFA,
block_SFB.device_data(), layout_SFB
},
{ // Epilogue arguments
{options.alpha, options.beta},
block_C.device_data(), stride_C,
block_D.device_data(), stride_D
}
};
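// max_swizzle_size controls the tile scheduler's cluster rasterization swizzle;
// values > 1 re-order CTA tile visitation (typically to improve L2 reuse),
// while 0 keeps the default order.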
arguments.scheduler.max_swizzle_size = options.swizzle;
return arguments;
}
bool verify(const Options &options) {
using namespace cute;
// Create the arguments for host reference implementation
Tensor tensor_A = make_tensor(make_iterator(block_A.host_data()), layout_A);
Tensor tensor_SFA = make_tensor(block_SFA.host_data(), layout_SFA);
Tensor tensor_B = make_tensor(make_iterator(block_B.host_data()), layout_B);
Tensor tensor_SFB = make_tensor(block_SFB.host_data(), layout_SFB);
cutlass::reference::host::GettBlockScalingMainloopParams<
ElementAccumulator, // ElementAccumulator
decltype(tensor_A), // TensorA
decltype(tensor_SFA), // TensorSfA
decltype(tensor_B), // TensorB
decltype(tensor_SFB) // TensorSfB
> mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};
auto tensor_C = cute::make_tensor(make_iterator(block_C.host_data()), layout_C);
auto tensor_D = cute::make_tensor(make_iterator(block_reference_D.host_data()), layout_D);
cutlass::reference::host::GettBlockScalingEpilogueParams<
ElementAccumulator, // ElementScalar
ElementAccumulator, // ElementAccumulator
ElementAccumulator, // ElementCompute
decltype(tensor_C), // TensorC
decltype(tensor_D) // TensorD
> epilogue_params{options.alpha, options.beta, tensor_C, tensor_D};
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
// Comparison
block_D.sync_host();
bool passed = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
passed &= (cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0);
passed &= (cutlass::reference::host::TensorNorm(block_D.host_view()) > 0);
return passed;
}
/// Execute a given example GEMM computation
template <typename Gemm>
int run(Options &options)
{
initialize(options);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm;
// Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
auto arguments = args_from_options(options);
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
uint8_t* workspace = nullptr;
cudaError_t status = cudaMalloc(&workspace, workspace_size);
if (status != cudaSuccess) {
std::cerr << "Failed to allocate workspace memory: " << cudaGetErrorString(status) << std::endl;
return -1;
}
// Check if the problem size is supported or not
CUTLASS_CHECK(gemm.can_implement(arguments));
// Initialize CUTLASS kernel with arguments and workspace pointer
CUTLASS_CHECK(gemm.initialize(arguments, workspace));
// Correctness / Warmup iteration
CUTLASS_CHECK(gemm.run());
// Keep the workspace allocated: gemm.initialize() is invoked again in the profiling loop below.
cudaDeviceSynchronize();
// Check if output from CUTLASS kernel and reference kernel are equal or not
Result result;
result.passed = verify(options);
std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
if (!result.passed) {
exit(-1);
}
// Run profiling loop
if (options.iterations > 0)
{
GpuTimer timer;
timer.start();
for (int iter = 0; iter < options.iterations; ++iter) {
CUTLASS_CHECK(gemm.initialize(arguments, workspace));
CUTLASS_CHECK(gemm.run());
}
timer.stop();
// Compute average runtime and GFLOPs.
float elapsed_ms = timer.elapsed_millis();
result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
std::cout << " GFLOPS: " << result.gflops << std::endl;
}
// Free workspace memory
cudaFree(workspace);
return 0;
}
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// CUTLASS must be compiled with CUDA 12.9 or higher Toolkit to run this example
// and the GPU must have compute capability 103.
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
std::cerr << "This example requires CUDA 12.9 or newer." << std::endl;
// Returning zero so this test passes on older Toolkits; it performs no action.
return 0;
}
cudaDeviceProp props;
int current_device_id;
CUDA_CHECK(cudaGetDevice(&current_device_id));
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
if (!(props.major == 10 && props.minor == 3)) {
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 103)." << std::endl;
return 0;
}
//
// Parse options
//
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
//
// Evaluate CUTLASS kernels
//
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,38 @@
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (CUTLASS_NVCC_ARCHS MATCHES 103a)
cutlass_example_add_executable(
89_sm103_fp4_ultra_gemm
89_sm103_fp4_ultra_gemm.cu
)
endif()

File diff suppressed because it is too large

View File

@ -0,0 +1,42 @@
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(TEST_RANDOM_SMALL_GROUP --groups=3 --iterations=1) # Random problem sizes
set(TEST_EPILOGUE_SMALL_GROUP --alpha=1.5 --beta=2.0 --groups=3 --iterations=1) # Random problem sizes
if (CUTLASS_NVCC_ARCHS MATCHES 103a)
cutlass_example_add_executable(
90_sm103_fp4_ultra_grouped_gemm
90_sm103_fp4_ultra_grouped_gemm.cu
TEST_COMMAND_OPTIONS
TEST_RANDOM_SMALL_GROUP
TEST_EPILOGUE_SMALL_GROUP
)
endif()

View File

@ -0,0 +1,898 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <cstdint> // uint64_t
#include <cstdio>
#include <cstdlib> // rand(), RAND_MAX
#include <string> // std::stoi
#include <vector>
#include <iostream>
#include <float.h>
#include <optional>
#include "cutlass/util/command_line.h"
// clang-format off
#include "cute/tensor.hpp" // FIX cute header file inclusion issue
// clang-format on
#include "cute/arch/mma_sm100_desc.hpp" // cute::UMMA::Major
#include "cute/numeric/numeric_types.hpp" // cute::sizeof_bits_v
#include "cutlass/complex.h" // cutlass::ComplexTransform
#include "cutlass/cutlass.h" // cutlass::Status
#include "cutlass/detail/sm100_blockscaled_layout.hpp" // cutlass::detail::Sm1xxBlockScaledOutputConfig
#include "cutlass/epilogue/thread/linear_combination.h" // cutlass::epilogue::thread::LinearCombination
#include "cutlass/gemm/device/gemv_blockscaled.h" // cutlass::gemm::device::Gemv
#include "cutlass/gemm/kernel/gemv_blockscaled.h" // cutlass::gemm::kernel::Gemv
#include "cutlass/epilogue/threadblock/epilogue_with_scaling_factor.h" // cutlass::epilogue::threadblock::GemvEpilogueWithScalingFactor
#include "cutlass/gemm_coord.h" // cutlass::GemmCoord
#include "cutlass/layout/matrix.h" // cutlass::layout::Affine2Layout_Factory
#include "cutlass/numeric_size.h" // cutlss::is_subbyte
#include "cutlass/numeric_types.h"
#include "cutlass/platform/platform.h" // cutlass::is_same_v
#include "cutlass/util/device_memory.h" // cutlass::device_memory::allocation
#include "cutlass/util/distribution.h" // cutlass::Distribution
#include "cutlass/util/host_tensor.h" // cutlass::HostTensor
#include "cutlass/util/packed_stride.hpp" // cutlass::make_cute_packed_stride
#include "cutlass/util/reference/host/gemm_complex.h" // cutlass::reference::host::GemmComplex
#include <cutlass/util/reference/host/gett.hpp> // cutlass::reference::host::GettBlockScalingMainloopParams
// cutlass::reference::host::GettBlockScalingEpilogueParams
// cutlass::reference::host::Gemm3x
#include "cutlass/util/reference/host/tensor_compare.h" // cutlass::reference::host::TensorEquals
#include "cutlass/util/reference/host/tensor_fill.h" // cutlass::reference::host::TensorFillRandomUniform
#include "cutlass/numeric_size.h" // cutlass::bits_to_bytes
// Helper Functions
template <typename T>
auto
make_iterator(T* ptr)
{
return cute::recast_ptr<T>(ptr);
}
template <typename Element, typename Layout>
bool
initialize_tensor(cutlass::TensorView<Element, Layout> view, cutlass::Distribution::Kind dist_kind, uint64_t seed)
{
if (dist_kind == cutlass::Distribution::Uniform) {
double scope_max, scope_min;
int bits_input = cutlass::sizeof_bits<Element>::value;
if (bits_input == 1) {
scope_max = 2;
scope_min = 0;
} else if (bits_input <= 6) {
scope_max = 2;
scope_min = -2;
} else if (bits_input <= 8) {
if constexpr (cutlass::is_same_v<Element, cutlass::float_ue4m3_t> ||
cutlass::is_same_v<Element, cutlass::float_ue8m0_t>) {
scope_max = 4;
scope_min = 1;
} else {
scope_max = 1;
scope_min = -1;
}
} else {
scope_max = 4;
scope_min = -4;
}
cutlass::reference::host::TensorFillRandomUniform(view, seed, scope_max, scope_min, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else {
CUTLASS_ASSERT(false);
return false;
}
return true;
}
// Base class of Testbed
template <
typename Gemv_,
// The following types are more difficult to be derived from EVT
typename ElementC, typename LayoutC, typename ElementD_,
typename LayoutD, typename ElementSFD_, typename LayoutSFD,
typename ElementCompute_, int kVectorSize_>
struct TestbedGemvFp4SFDBase
{
public:
using Gemv = Gemv_;
using ElementA = typename Gemv::ElementA;
using ElementSFA = typename Gemv::ElementSFA;
using LayoutA = typename Gemv::LayoutA;
static_assert(cutlass::is_same_v<LayoutA, cutlass::layout::RowMajor>, "only support row major matrix A");
static_assert(cutlass::sizeof_bits<ElementSFA>::value == 8, "ElementSFA should be FP8 type");
using ElementB = typename Gemv::ElementB;
using ElementSFB = typename Gemv::ElementSFB;
using LayoutB = cutlass::layout::ColumnMajor;
static_assert(cutlass::is_same_v<ElementA, ElementB>, "only support ElementA ElementB of same type");
static_assert(cutlass::sizeof_bits<ElementSFB>::value == 8, "ElementSFB should be FP8 type");
static_assert(cutlass::is_same_v<LayoutC, cutlass::layout::ColumnMajor>, "only support col major output D");
using ElementD = ElementD_;
static_assert(cutlass::is_same_v<LayoutD, cutlass::layout::ColumnMajor>, "only support col major output D");
using ElementSFD = ElementSFD_;
static_assert(cutlass::is_same_v<LayoutSFD, cutlass::layout::ColumnMajor>, "only support col major output SFD");
static_assert(cutlass::sizeof_bits<ElementSFD>::value, "only support 8 bit SFD");
using ElementAccumulator = typename Gemv::ElementAccumulator;
using ElementCompute = ElementCompute_;
static_assert(cutlass::is_same_v<ElementCompute, float>, "only support fp32 epi compute");
static constexpr int kVectorSize = kVectorSize_;
static_assert(kVectorSize == 16, "only support vs 16");
// SFD Config
static constexpr bool kIsKMajorSFD = cutlass::is_same_v<LayoutSFD, cutlass::layout::RowMajor>;
using Sm1xxBlockScaledOutputConfig =
cutlass::detail::Sm1xxBlockScaledOutputConfig<kVectorSize,
kIsKMajorSFD ? cute::UMMA::Major::K : cute::UMMA::Major::MN>;
using Blk_MN_Output = typename Sm1xxBlockScaledOutputConfig::Blk_MN;
using Blk_SF_Output = typename Sm1xxBlockScaledOutputConfig::Blk_SF;
using OutputSFAtom = typename Sm1xxBlockScaledOutputConfig::SfAtom;
// SFA SFB Config
using Sm100BlockScaledInputConfig = cutlass::detail::Sm1xxBlockScaledConfig<kVectorSize>;
using Blk_MN_Input = typename Sm100BlockScaledInputConfig::Blk_MN;
using Blk_SF_Input = typename Sm100BlockScaledInputConfig::Blk_SF;
using SfAtom_Input = typename Sm100BlockScaledInputConfig::SfAtom;
public:
TestbedGemvFp4SFDBase(cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_D_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_SFA_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_SFB_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_SFD_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2023)
: init_A(init_A_)
, init_B(init_B_)
, init_C(init_C_)
, init_D(init_D_)
, init_SFA(init_SFA_)
, init_SFB(init_SFB_)
, init_SFD(init_SFD_)
, seed(seed_)
{
}
bool initialize(cutlass::MatrixCoord problem_size, int32_t batch_count)
{
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
const int32_t gemm_n = 1;
const int32_t gemm_batch = batch_count;
// Resize Config SFA/SFB
auto k_blks_input = cutlass::ceil_div(gemm_k, cute::size<1>(shape(SfAtom_Input{})));
auto m_blks_input = cutlass::ceil_div(gemm_m, Blk_MN_Input{});
auto n_blks_input = cutlass::ceil_div(gemm_n, Blk_MN_Input{});
auto sfa_coord = cutlass::make_Coord(m_blks_input * Blk_MN_Input{} * gemm_batch, k_blks_input * Blk_SF_Input{});
auto sfb_coord = cutlass::make_Coord(n_blks_input * Blk_MN_Input{} * gemm_batch, k_blks_input * Blk_SF_Input{});
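// Illustrative sizing (hypothetical values; the real Blk_MN / Blk_SF come from
// Sm1xxBlockScaledConfig): assuming Blk_MN = 128, Blk_SF = 4, and an SF-atom K
// extent of kVectorSize * Blk_SF = 64, a problem with gemm_m = 4096, gemm_k = 2048,
// gemm_batch = 1 gives m_blks_input = 32 and k_blks_input = 32, so
// sfa_coord = (32 * 128 * 1, 32 * 4) = (4096, 128).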
auto sfa_resize_layout =
cutlass::layout::Affine2Layout_Factory<LayoutA>::layout_factory(sfa_coord, typename LayoutA::Stride{});
auto sfb_resize_layout =
cutlass::layout::Affine2Layout_Factory<LayoutB>::layout_factory(sfb_coord, typename LayoutB::Stride{});
// Use the same SFD layout generation as reference for tensor creation
using ProblemShapeType = cute::Shape<int, int, int, int>;
auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};
// Generate the same layout as reference uses
auto sfd_layout = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL);
// Extract size from the generated layout and create coordinate
auto sfd_size = cute::size(cute::filter_zeros(sfd_layout));
auto sfd_coord = cutlass::make_Coord(sfd_size, 1); // Linear layout for HostTensor
auto sfd_resize_layout =
cutlass::layout::Affine2Layout_Factory<LayoutSFD>::layout_factory(sfd_coord, typename LayoutSFD::Stride{});
// Resize Host
this->reference_D.resize({gemm_batch * gemm_m, 1}); // D col major vector
this->reference_SFD.resize(sfd_coord, sfd_resize_layout);
if (initialize_tensor(this->reference_D.host_view(), this->init_D, this->seed + 7) == false) {
printf("initialize_tensor() REF D failed\n");
return false;
}
if (initialize_tensor(this->reference_SFD.host_view(), this->init_SFD, this->seed + 9) == false) {
printf("initialize_tensor() REF SFD failed\n");
return false;
}
// Resize A/B/C/D
this->tensor_A.resize({gemm_batch * gemm_m, gemm_k}); // A row major
this->tensor_B.resize({gemm_batch * gemm_k, 1}); // B col major vector
this->tensor_C.resize({gemm_batch * gemm_m, 1}); // C col major vector
this->tensor_D.resize({gemm_batch * gemm_m, 1}); // D col major vector
this->tensor_SFA.resize(sfa_coord, sfa_resize_layout);
this->tensor_SFB.resize(sfb_coord, sfb_resize_layout);
this->tensor_SFD.resize(sfd_coord, sfd_resize_layout);
// Fill A/B/C
if (initialize_tensor(this->tensor_A.host_view(), this->init_A, this->seed + 1) == false) {
printf("initialize_tensor() A failed\n");
return false;
}
if (initialize_tensor(this->tensor_B.host_view(), this->init_B, this->seed + 2) == false) {
printf("initialize_tensor() B failed\n");
return false;
}
if (initialize_tensor(this->tensor_C.host_view(), this->init_C, this->seed + 3) == false) {
printf("initialize_tensor() C failed\n");
return false;
}
// Fill SFA/SFB
if (initialize_tensor(this->tensor_SFA.host_view(), this->init_SFA, this->seed + 4) == false) {
printf("initialize_tensor() SFA failed\n");
return false;
}
if (initialize_tensor(this->tensor_SFB.host_view(), this->init_SFB, this->seed + 5) == false) {
printf("initialize_tensor() SFB failed\n");
return false;
}
// Fill D/SFD
if (initialize_tensor(this->tensor_D.host_view(), this->init_D, this->seed + 6) == false) {
printf("initialize_tensor() D failed\n");
return false;
}
if (initialize_tensor(this->tensor_SFD.host_view(), this->init_SFD, this->seed + 8) == false) {
printf("initialize_tensor() SFD failed\n");
return false;
}
// Copy A/B/C from host to device
this->tensor_A.sync_device();
this->tensor_B.sync_device();
this->tensor_C.sync_device();
this->tensor_D.sync_device();
this->tensor_SFA.sync_device();
this->tensor_SFB.sync_device();
this->tensor_SFD.sync_device();
// SFD initialization is different:
// tensor_SFD was initialized on the host and synced to the device above; its device
// data is now copied back into reference_SFD so that both tensors hold identical bytes.
// Otherwise the "bubbles" (layout padding) in SFD could cause false-negative sanity checks.
cutlass::device_memory::copy_to_host(this->reference_SFD.host_data(), this->tensor_SFD.device_data(), sfd_size);
return true;
}
bool compare_reference()
{
// device -> host
this->tensor_D.sync_host();
bool passed = true;
// Check
passed = cutlass::reference::host::TensorEquals(this->reference_D.host_view(), this->tensor_D.host_view());
if (passed == false) {
printf("gemm_m: %d, gemm_k: %d, ", this->tensor_A.host_view().extent(0), this->tensor_A.host_view().extent(1));
printf("tensorD mismatch\n");
return false;
}
this->tensor_SFD.sync_host();
passed = cutlass::reference::host::TensorEquals(this->reference_SFD.host_view(), this->tensor_SFD.host_view());
if (passed == false) {
printf("gemm_m: %d, gemm_k: %d, ", this->tensor_A.host_view().extent(0), this->tensor_A.host_view().extent(1));
printf("tensorSFD mismatch\n");
return false;
}
return passed;
}
bool run_reference(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
float epilogue_st)
{
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
const int32_t gemm_n = 1;
const int32_t gemm_batch = batch_count;
// Run reference blockscale GETT
using ProblemShapeType = cute::Shape<int, int, int, int>;
auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};
auto SfD = make_tensor(make_iterator(this->reference_SFD.host_data()),
Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL));
using StrideA = cutlass::gemm::TagToStrideA_t<LayoutA>;
using StrideB = cutlass::gemm::TagToStrideB_t<LayoutB>;
using StrideC = cutlass::gemm::TagToStrideC_t<LayoutC>;
using StrideD = cutlass::gemm::TagToStrideC_t<LayoutD>;
StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(gemm_m, gemm_k, gemm_batch));
StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(gemm_n, gemm_k, gemm_batch));
StrideC stride_c = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(gemm_m, gemm_n, gemm_batch));
StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(gemm_m, gemm_n, gemm_batch));
auto A = make_tensor(make_iterator(this->tensor_A.host_data()),
cute::make_layout(cute::make_shape(gemm_m, gemm_k, gemm_batch), stride_a));
auto B = make_tensor(make_iterator(this->tensor_B.host_data()),
cute::make_layout(cute::make_shape(gemm_n, gemm_k, gemm_batch), stride_b));
auto C = cute::make_tensor(make_iterator(this->tensor_C.host_data()),
cute::make_layout(cute::make_shape(gemm_m, gemm_n, gemm_batch), stride_c));
auto D = cute::make_tensor(make_iterator(this->reference_D.host_data()),
cute::make_layout(cute::make_shape(gemm_m, gemm_n, gemm_batch), stride_d));
auto layout_sfa = Sm100BlockScaledInputConfig::tile_atom_to_shape_SFA(problem_shape_MNKL);
auto layout_sfb = Sm100BlockScaledInputConfig::tile_atom_to_shape_SFB(problem_shape_MNKL);
auto SfA = make_tensor(this->tensor_SFA.host_data(), layout_sfa);
auto SfB = make_tensor(this->tensor_SFB.host_data(), layout_sfb);
// Internally, the mainloop scale factors are disabled when ElementA/B == ElementSFA/B.
typename cutlass::reference::host::GettBlockScalingMainloopParams<ElementAccumulator, // ElementAccumulator
decltype(A), // TensorA
decltype(SfA), // TensorSfA
decltype(B), // TensorB
decltype(SfB) // TensorSfB
>
mainloop_params{A, SfA, B, SfB};
typename cutlass::reference::host::GettBlockScalingEpilogueParams<ElementCompute, // ElementScalar
ElementAccumulator, // ElementAccumulator
ElementCompute, // ElementCompute
decltype(C), // TensorC
decltype(D), // TensorD
decltype(SfD), // TensorSfD
cute::Int<kVectorSize>, // OutputVectorSize
cutlass::reference::host::SfStrategy::SfDGen
>
epilogue_params{alpha, beta, C, D, SfD, epilogue_st};
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);
return true;
}
virtual typename Gemv::Arguments get_arguments(
cutlass::MatrixCoord problem_size, int32_t batch_count,
float epilogue_st, ElementCompute alpha, ElementCompute beta) = 0;
bool run_gemv(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
[[maybe_unused]] float epilogue_st,
bool is_profiling,
int kIterations)
{
// Batched input is not supported for testing.
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
[[maybe_unused]] const int32_t gemm_n = 1;
[[maybe_unused]] const int32_t gemm_batch = batch_count;
Gemv gemv_op;
typename Gemv::Arguments arguments = this->get_arguments(
problem_size, batch_count, epilogue_st, alpha, beta
);
cutlass::Status status = gemv_op.can_implement(arguments);
if (status != cutlass::Status::kSuccess) {
printf("can_implement() failed\n");
return false;
}
size_t workspace_size = Gemv::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
status = gemv_op.initialize(arguments, workspace.get());
if (status != cutlass::Status::kSuccess) {
printf("initialize() failed\n");
return false;
}
if (not is_profiling) {
status = gemv_op();
}
// profiling
else {
cudaError_t result;
cudaEvent_t events[2];
for (cudaEvent_t &evt : events) {
result = cudaEventCreate(&evt);
if (result != cudaSuccess) {
std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
}
// warmup
status = gemv_op();
if (status != cutlass::Status::kSuccess) {
std::cerr << "Device execution failed on warmup." << std::endl;
return false;
}
result = cudaEventRecord(events[0]);
if (result != cudaSuccess) {
std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
for (int iter_i = 0; iter_i < kIterations; ++iter_i) {
status = gemv_op();
if (status != cutlass::Status::kSuccess) {
std::cerr << "Device execution failed." << std::endl;
return false;
}
}
result = cudaEventRecord(events[1]);
if (result != cudaSuccess) {
std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
float elapsed_ms = 0;
result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]);
if (result != cudaSuccess) {
std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
for (cudaEvent_t &evt : events) {
result = cudaEventDestroy(evt);
if (result != cudaSuccess) {
std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(result) << std::endl;
return false;
}
}
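// Performance model: FLOPs are the usual 2*M*N*K; bytes sum the A, B, and D
// traffic plus one scale factor per kVectorSize elements of each operand
// (hence the divisions by kVectorSize below). C traffic is not counted.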
int64_t flops = int64_t(gemm_m) * gemm_n * gemm_k * 2;
int64_t bytes = cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementA>) * int64_t(gemm_m) * int64_t(gemm_k)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementB>) * int64_t(gemm_k) * int64_t(gemm_n)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementD>) * int64_t(gemm_m) * int64_t(gemm_n)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFA>) * int64_t(gemm_m) * int64_t(gemm_k) / int64_t(kVectorSize)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFB>) * int64_t(gemm_k) * int64_t(gemm_n) / int64_t(kVectorSize)) +
cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFD>) * int64_t(gemm_m) * int64_t(gemm_n) / int64_t(kVectorSize));
double gflops_per_second = double(flops) * kIterations * gemm_batch / double(elapsed_ms / 1000.0f) / double(1.0e9);
double gbytes_per_second = double(bytes) * kIterations * gemm_batch / double(elapsed_ms / 1000.0f) / double(1 << 30);
double elapsed_ms_per_iter = double(elapsed_ms) / kIterations;
std::cout << " Problem: "
<< gemm_m << "-by-" << gemm_n << "-by-" << gemm_k
<< ", batch size: " << gemm_batch
<< std::endl;
std::cout << " Runtime: " << elapsed_ms_per_iter << " ms" << std::endl;
std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl;
std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl;
}
if (status != cutlass::Status::kSuccess) {
printf("gemv exec failed\n");
return false;
}
return true;
}
bool run_and_verify(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
float epilogue_st)
{
// Initialize Data
if (this->initialize(problem_size, batch_count) == false) {
return false;
}
// Run GEMV kernel
if (this->run_gemv(problem_size, batch_count, alpha, beta, epilogue_st, false /*is_profiling*/, 1) == false) {
return false;
}
// Run Reference Kernel
if (this->run_reference(problem_size, batch_count, alpha, beta, epilogue_st) == false) {
printf("run_reference() failed\n");
return false;
}
// Verify
if (this->compare_reference() == false) {
printf("compare_reference() failed\n");
return false;
}
return true;
}
bool profile(cutlass::MatrixCoord problem_size,
int32_t batch_count,
ElementCompute alpha,
ElementCompute beta,
float epilogue_st,
int kIterations = 10)
{
// Initialize Data
if (this->initialize(problem_size, batch_count) == false) {
return false;
}
// Profile GEMV kernel
if (this->run_gemv(problem_size, batch_count, alpha, beta, epilogue_st, true /*is_profiling*/, kIterations) == false) {
return false;
}
return true;
}
public:
// Data Storage
cutlass::HostTensor<ElementA, LayoutA> tensor_A;
cutlass::HostTensor<ElementSFA, LayoutA> tensor_SFA;
cutlass::HostTensor<ElementB, LayoutB> tensor_B;
cutlass::HostTensor<ElementSFB, LayoutB> tensor_SFB;
cutlass::HostTensor<ElementC, LayoutC> tensor_C;
cutlass::HostTensor<ElementD, LayoutD> tensor_D;
cutlass::HostTensor<ElementSFD, LayoutD> tensor_SFD;
cutlass::HostTensor<ElementD, LayoutD> reference_D;
cutlass::HostTensor<ElementSFD, LayoutD> reference_SFD;
// Data Init Setting
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_D;
cutlass::Distribution::Kind init_SFA;
cutlass::Distribution::Kind init_SFB;
cutlass::Distribution::Kind init_SFD;
uint64_t seed;
};
template<typename Gemv_>
struct TestbedGemvFp4SFD : public TestbedGemvFp4SFDBase<
Gemv_,
typename Gemv_::ElementC,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementD,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementSFD,
typename Gemv_::EpilogueOutputOp::LayoutSFD,
typename Gemv_::EpilogueOutputOp::ElementCompute,
Gemv_::EpilogueOutputOp::kVectorSize
> {
using Base = TestbedGemvFp4SFDBase<
Gemv_,
typename Gemv_::ElementC,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementD,
typename Gemv_::EpilogueOutputOp::LayoutOutput,
typename Gemv_::EpilogueOutputOp::ElementSFD,
typename Gemv_::EpilogueOutputOp::LayoutSFD,
typename Gemv_::EpilogueOutputOp::ElementCompute,
Gemv_::EpilogueOutputOp::kVectorSize
>;
using Base::Base;
using Gemv = Gemv_;
using ElementCompute = typename Base::ElementCompute;
using SfAtom_Input = typename Base::SfAtom_Input;
using Blk_MN_Input = typename Base::Blk_MN_Input;
using Blk_SF_Input = typename Base::Blk_SF_Input;
static constexpr int kVectorSize = Base::kVectorSize;
typename Gemv::Arguments get_arguments(
cutlass::MatrixCoord problem_size,
int32_t batch_count, float epilogue_st,
ElementCompute alpha, ElementCompute beta) override {
const int32_t gemm_m = problem_size.row();
const int32_t gemm_k = problem_size.column();
[[maybe_unused]] const int32_t gemm_n = 1;
[[maybe_unused]] const int32_t gemm_batch = batch_count;
auto k_blks_input = cutlass::ceil_div(gemm_k, cute::size<1>(shape(SfAtom_Input{})));
auto m_blks_input = cutlass::ceil_div(gemm_m, Blk_MN_Input{});
auto n_blks_input = cutlass::ceil_div(gemm_n, Blk_MN_Input{});
int batch_stride_SFA = m_blks_input * Blk_MN_Input{} * k_blks_input * Blk_SF_Input{};
int batch_stride_SFB = n_blks_input * Blk_MN_Input{} * k_blks_input * Blk_SF_Input{};
// Use the same SFD layout generation as reference to get correct batch stride
using ProblemShapeType = cute::Shape<int, int, int, int>;
auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};
// Generate the same layout as reference uses
using Sm1xxBlockScaledOutputConfig = typename Base::Sm1xxBlockScaledOutputConfig;
auto sfd_layout = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL);
// Calculate batch stride from the generated layout
// Extract the batch stride from the 3rd dimension stride
// The stride<2> gives us the stride for the batch dimension
auto batch_stride_tuple = cute::stride<2>(sfd_layout); // This returns (_0, 8192)
int batch_stride_SFD = static_cast<int>(cute::get<1>(batch_stride_tuple)); // Extract the 8192 part
// Initialize GEMV kernel
typename Gemv::Arguments arguments{
problem_size, // problem_size
batch_count, // batch_count
typename Gemv::EpilogueOutputOp::Params{
this->tensor_D.device_ref(), // tensor_d
this->tensor_SFD.device_data(), // scale_factor_d_ptr
alpha, // alpha
beta, // beta
epilogue_st, // st
batch_stride_SFD, // batch_stride_sfd
gemm_m // stride_d
},
this->tensor_A.device_ref(), // ref_A
this->tensor_B.device_data(), // ptr_B
this->tensor_C.device_data(), // ptr_C
this->tensor_D.device_data(), // ptr_D
this->tensor_SFA.device_data(), // ptr_SFA
this->tensor_SFB.device_data(), // ptr_SFB
gemm_k, // stride_A
gemm_m * gemm_k, // batch_stride_A
gemm_k, // batch_stride_B
gemm_m, // batch_stride_C
gemm_m, // batch_stride_D
batch_stride_SFA, // batch_stride_SFA
batch_stride_SFB, // batch_stride_SFB
batch_stride_SFD // batch_stride_SFD
};
return arguments;
}
};
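// ---------------------------------------------------------------------------
// A minimal, standalone sketch (not used by the example) of the batch-stride
// extraction performed in get_arguments() above. The nested shape/stride values
// here are hypothetical stand-ins for what tile_atom_to_shape_SFD() produces;
// only the access pattern -- cute::stride<2>() followed by cute::get<1>() --
// mirrors the real code. Relies on the cute/tensor.hpp include above.
inline int sfd_batch_stride_sketch() {
  using namespace cute;
  // The batch mode (mode 2) is nested, so its stride is a tuple such as (_0, 8192).
  auto sfd_like = make_layout(
      make_shape (Int<128>{}, Int<64>{}, make_shape(Int<1>{}, 4)),
      make_stride(Int<1>{}, Int<128>{}, make_stride(Int<0>{}, 8192)));
  auto batch_stride_tuple = stride<2>(sfd_like);        // (_0, 8192)
  return static_cast<int>(get<1>(batch_stride_tuple)); // 8192
}
// ---------------------------------------------------------------------------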
struct Options {
bool help = false;
int m = 4096;
int k = 2048;
int n = 1;
int batch = 1;
float alpha = 1.0f;
float beta = 0.0f;
float epilogue_st = -1.0f; // sentinel for random
bool profiling = true;
int iterations = 10;
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}
cmd.get_cmd_line_argument("m", m);
cmd.get_cmd_line_argument("k", k);
cmd.get_cmd_line_argument("batch", batch);
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("epilogue_st", epilogue_st);
cmd.get_cmd_line_argument("profiling", profiling);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "91_fp4_gemv\n\n"
<< " FP4 GEMV with block-scaled inputs and outputs.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --batch=<int> Sets the batch count of the GEMM\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n"
<< " --epilogue_st=<f32> Epilogue ST value\n\n"
<< " --profiling=<bool> Whether to run profiling\n\n"
<< " --iterations=<int> Number of profiling iterations to perform\n\n";
out
<< "\n\nExamples:\n\n"
<< "$ " << "91_fp4_gemv" << " --m=4096 --k=2048 --batch=1 \n\n";
return out;
}
};
bool
run_fp4_gemv_device(Options const& options)
{
CUTLASS_ASSERT(options.n == 1);
using ElementA = cutlass::float_e2m1_t;
using ElementSFA = cutlass::float_e4m3_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::float_e2m1_t;
using ElementSFB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e2m1_t;
using ElementD = cutlass::float_e2m1_t;
using LayoutD = cutlass::layout::ColumnMajor;
using ElementSFD = cutlass::float_e4m3_t;
// Indicates SF is computed along the column dim; this does NOT determine the actual SFD layout.
using LayoutSFD = cutlass::layout::ColumnMajor;
using ElementAccumulatorMainloop = cutlass::half_t;
using ElementAccumulator = float;
using ElementCompute = float;
ElementCompute alpha{options.alpha};
ElementCompute beta{options.beta};
// Must be a positive number.
const float epilogue_st = options.epilogue_st < 0.f ?
static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 5)) :
options.epilogue_st;
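// (The expression above draws epilogue_st roughly uniformly from [0, 5].)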
static constexpr int kVectorSize = 16;
static constexpr int kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementA>::value;
using ThreadShape = cutlass::gemm::GemmShape<16, 8>;
static_assert(kVectorSize == ThreadShape::kM, "vector size and threads per row must be equal");
// Construct Epilogue
using EpilogueOp = typename cutlass::epilogue::threadblock::GemvEpilogueWithScalingFactor<kVectorSize,
ThreadShape,
ElementCompute,
ElementAccumulator,
ElementC,
ElementD,
ElementSFD,
LayoutD,
LayoutSFD>;
// Construct Mainloop
using Gemv = cutlass::gemm::device::GemvBlockScaled<
cutlass::gemm::kernel::
GemvBlockScaled<ElementA, LayoutA, ElementB, ElementD, ElementAccumulatorMainloop, EpilogueOp, kElementsPerAccess>>;
TestbedGemvFp4SFD<Gemv> testbed;
bool pass = true;
if (options.profiling) {
pass = testbed.profile(cutlass::MatrixCoord{options.m, options.k}, options.batch, alpha, beta, epilogue_st, options.iterations);
}
else {
pass = testbed.run_and_verify(cutlass::MatrixCoord{options.m, options.k}, options.batch, alpha, beta, epilogue_st);
}
return pass;
}
int
main(int argc, char const** argv)
{
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
// Run verification
Options verification_options = options;
verification_options.profiling = false;
bool passed = run_fp4_gemv_device(verification_options);
if (passed == false) {
printf("test fail\n");
return 1;
} else {
printf("test pass\n");
}
if (options.profiling) {
// Start profiling
printf("\nProfiling...\n");
passed = run_fp4_gemv_device(options);
if (passed == false) {
printf("profiling fail\n");
return 1;
} else {
printf("profiling completed\n");
}
}
return 0;
#else
std::cerr << "Unsupported example. Please ensure CUTLASS_ARCH_MMA_SM100_SUPPORTED is defined.\n";
return 0;
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
}

View File

@ -0,0 +1,36 @@
# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (NOT MSVC)
cutlass_example_add_executable(
91_fp4_gemv
91_fp4_gemv.cu
)
endif()

View File

@ -163,7 +163,12 @@ foreach(EXAMPLE
82_blackwell_distributed_gemm
83_blackwell_sparse_gemm
84_blackwell_narrow_precision_sparse_gemm
86_blackwell_mixed_dtype_gemm
87_blackwell_geforce_gemm_blockwise
88_hopper_fmha
89_sm103_fp4_ultra_gemm
90_sm103_fp4_ultra_grouped_gemm
91_fp4_gemv
)
add_subdirectory(${EXAMPLE})

View File

@ -1,5 +1,14 @@
# CUTLASS - Programming Examples
> [!IMPORTANT]
> ### ⚠️ **Not for Benchmarking!** ⚠️
>
> These examples are designed **solely for demonstrating CUTLASS functionality** and may **NOT optimized for performance benchmarking**.
>
> **For accurate performance measurements**, please use the **[CUTLASS Profiler](../tools/profiler/)** instead (recommended) or manually auto-tune the example, if unavailable via the profiler.
>
* [00_basic_gemm](00_basic_gemm/)
launches a basic GEMM with single precision inputs and outputs

View File

@ -39,14 +39,13 @@
*/
#pragma once
#include "cutlass/cutlass.h"
#include <iostream>
#include <cuda/atomic>
#include <cuda/std/atomic>
#include CUDA_STD_HEADER(atomic)
#include "cute/layout.hpp"
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/cuda_host_adapter.hpp"

View File

@ -452,8 +452,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,
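The rewritten dimGrid is arithmetically equivalent to the old round_up form whenever the multiplier is the same cluster extent, since for positive integers

    round_up(ceil(M/b), c) == c * ceil(M/(b*c))

(for example, M = 640, b = 128, c = 2 gives round_up(5, 2) = 6 on the left and 2 * ceil(640/256) = 6 on the right). The substantive change, as far as the hunk shows, is that the extents now come from the VMNK cluster layout (size<1> and size<2> of cluster_layout_vmnk) rather than from dimCluster.x/y, which can differ when the cluster layout carries a peer (V) mode.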

View File

@ -528,8 +528,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -567,8 +567,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -575,6 +575,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -681,8 +681,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -69,7 +69,7 @@
"import numpy as np\n",
"import random\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
"# Set to `False` to omit this information.\n",
@ -106,7 +106,7 @@
"metadata": {},
"source": [
"## Declaring and running a GEMM\n",
"To get started, one only needs to provide the tensors declared above to the `cutlass.op.Gemm` call.\n",
"To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Gemm` call.\n",
"This sets up a default GEMM operation for the given device on which you are running.\n",
"\n",
"Assuming that we are running on SM80, this default to using a GEMM that leverages FP16 Tensor Core operations.\n",
@ -123,7 +123,7 @@
"source": [
"# We specify `element_accumulator` here so as to match the kernel run by NumPy below. However,\n",
"# specifying `element_accumulator` is not required if it is the same as `element`\n",
"plan = cutlass.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor, element_accumulator=np.float32)\n",
"plan = cutlass_cppgen.Gemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=np.float32)\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
]
},
@ -133,7 +133,7 @@
"id": "4a5856de",
"metadata": {},
"source": [
"There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
"There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor."
]
},
{
@ -172,7 +172,7 @@
"metadata": {},
"source": [
"## Changing operation modes\n",
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass_cppgen.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
"\n",
"The operation mode currently in use can be returned via the `plan.opclass` property. In this case Tensor Core operations."
]
@ -197,7 +197,7 @@
"\n",
"As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n",
"\n",
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
]
},
{
@ -208,7 +208,7 @@
"outputs": [],
"source": [
"tensor_D_simt = np.zeros(tensor_C.shape).astype(type_D)\n",
"plan.opclass = cutlass.OpcodeClass.Simt\n",
"plan.opclass = cutlass_cppgen.OpcodeClass.Simt\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D_simt, alpha, beta, print_module=print_module)"
]
},
@ -262,7 +262,7 @@
"alpha = np.float16(1.)\n",
"beta = np.float16(2.)\n",
"\n",
"plan.opclass = cutlass.OpcodeClass.TensorOp\n",
"plan.opclass = cutlass_cppgen.OpcodeClass.TensorOp\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
]
},
@ -336,13 +336,13 @@
"# Stream K is exposed through the threadblock swizzle method for pre-SM90 kernels,\n",
"# and via the tile_scheduler attribute of the TileDescription for post-SM90 kernels\n",
"if plan.cc < 90:\n",
" plan.swizzling_functor = cutlass.swizzle.ThreadblockSwizzleStreamK\n",
" plan.swizzling_functor = cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK\n",
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)\n",
"else:\n",
" # Stream-K is currently only supported for warp-specialized cooperative kernels\n",
" td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative\n",
" td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
" td.tile_scheduler = cutlass.TileSchedulerType.StreamK\n",
" td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative\n",
" td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
" td.tile_scheduler = cutlass_cppgen.TileSchedulerType.StreamK\n",
"\n",
" plan.compile(td)\n",
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
@ -391,12 +391,12 @@
"metadata": {},
"outputs": [],
"source": [
"from cutlass.backend.utils.device import device_cc\n",
"from cutlass_cppgen.backend.utils.device import device_cc\n",
"\n",
"# 3xTF32 requires SM80 or higher\n",
"if device_cc() >= 80:\n",
" plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)\n",
" plan.math_operation = cutlass.MathOperation.multiply_add_fast_f32\n",
" plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
" plan.math_operation = cutlass_cppgen.MathOperation.multiply_add_fast_f32\n",
"\n",
" # Create input/output tensors in FP32\n",
" A, B = [np.ones((128, 128)).astype(np.float32) for _ in range(2)]\n",
@ -433,9 +433,9 @@
"\n",
"# FP8 is supported through the CUTLASS Python interface on SM90 and higher\n",
"if device_cc() >= 90:\n",
" plan = cutlass.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
" layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor,\n",
" layout_C=cutlass.LayoutType.ColumnMajor)\n",
" plan = cutlass_cppgen.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
" layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.ColumnMajor,\n",
" layout_C=cutlass_cppgen.LayoutType.ColumnMajor)\n",
"\n",
" # Create input/output tensors in FP8\n",
" A, B = [torch.ones((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n",

View File

@ -68,7 +68,7 @@
"source": [
"import numpy as np\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
@ -112,7 +112,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)\n",
"plan = cutlass_cppgen.op.Gemm(element=np.float16, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
]
},

View File

@ -75,7 +75,7 @@
"\n",
"## Declaring a grouped GEMM via the CUTLASS Python interface\n",
"A grouped GEMM operation is declared similarly to a GEMM operation in the CUTLASS Python interface: one\n",
"simply calls `cutlass.op.GroupedGemm`."
"simply calls `cutlass_cppgen.op.GroupedGemm`."
]
},
{
@ -85,11 +85,11 @@
"metadata": {},
"outputs": [],
"source": [
"import cutlass\n",
"import cutlass_cppgen\n",
"import torch\n",
"\n",
"dtype = torch.float16\n",
"plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)"
"plan = cutlass_cppgen.op.GroupedGemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor)"
]
},
{
@ -174,7 +174,7 @@
"outputs": [],
"source": [
"op = plan.construct()\n",
"grouped_gemm = cutlass.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
"grouped_gemm = cutlass_cppgen.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
]
},
{
@ -182,7 +182,7 @@
"id": "c8ca3991",
"metadata": {},
"source": [
"The `cutlass.emit.pytorch` function emits:\n",
"The `cutlass_cppgen.emit.pytorch` function emits:\n",
"* `out/grouped_gemm_kernel.cu`: This file contains the declaration of the CUTLASS kernel and a method to call it from PyTorch tensors\n",
"* `out/grouped_gemm.cpp`: This file contains a C++ wrapper around the aforementioned CUTLASS kernel\n",
"* `setup.py`: This file contains the `setuptools` script for building and installing the generated extension\n",

View File

@ -62,7 +62,7 @@
"import torch\n",
"import random\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
"# Set to `false` to omit this information.\n",
@ -80,7 +80,7 @@
"dilation = (1, 1)\n",
"\n",
"# Compute the output size [N, P, Q, K]\n",
"N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
"N, P, Q, K = cutlass_cppgen.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
"\n",
"dtype = torch.float16\n",
"type_A = torch.float16\n",
@ -111,7 +111,7 @@
"source": [
"## Declaring and running a Conv2d Fprop\n",
"\n",
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
"\n",
"Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n",
"\n",
@ -125,7 +125,7 @@
"outputs": [],
"source": [
"# Specifying `element_accumulator` is not required if it is the same as `element`\n",
"plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
"plan = cutlass_cppgen.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
"plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)"
]
},
@ -133,7 +133,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"There are many other ways to construct a plan from `cutlass.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass.op.Conv2dFprop` constructor.\n",
"There are many other ways to construct a plan from `cutlass_cppgen.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass_cppgen.op.Conv2dFprop` constructor.\n",
"\n",
"We then compare the output to running the Conv2d using PyTorch. PyTorch use NCHW layout by default, so permutations are required."
]
@ -200,7 +200,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan_dgrad = cutlass.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_dgrad = cutlass_cppgen.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_dgrad.run(grad_output, weight, tensor_C_dgrad, grad_input, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
"\n",
"grad_input_torch = alpha * torch.nn.grad.conv2d_input(\n",
@ -225,7 +225,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan_wgrad = cutlass.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_wgrad = cutlass_cppgen.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_wgrad.run(grad_output, input, tensor_C_wgrad, grad_weight, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
"\n",
"grad_weight_torch = alpha * torch.nn.grad.conv2d_weight(\n",

View File

@ -67,17 +67,17 @@
"outputs": [],
"source": [
"import torch\n",
"import cutlass\n",
"from cutlass.epilogue import relu\n",
"from cutlass import Tensor as FakeTensor\n",
"from cutlass.utils.profiler import CUDAEventProfiler\n",
"import cutlass_cppgen\n",
"from cutlass_cppgen.epilogue import relu\n",
"from cutlass_cppgen import Tensor as FakeTensor\n",
"from cutlass_cppgen.utils.profiler import CUDAEventProfiler\n",
"\n",
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
"print_module = True\n",
"\n",
"# The Epilogue Visitor feature currently only works for SM80 and 90\n",
"from cutlass.backend.utils.device import device_cc\n",
"from cutlass_cppgen.backend.utils.device import device_cc\n",
"if device_cc() not in [80, 90]:\n",
" import sys\n",
" sys.exit()\n",
@ -99,7 +99,7 @@
"tensor_C = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
"tensor_D = torch.zeros_like(tensor_C)\n",
"\n",
"plan = cutlass.op.Gemm(element=torch.float16, layout=cutlass.LayoutType.RowMajor, element_accumulator=torch.float32)"
"plan = cutlass_cppgen.op.Gemm(element=torch.float16, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=torch.float32)"
]
},
{
@ -115,7 +115,7 @@
"\n",
"The example tensors is a dictionary with tensor names as keys and reference tensors as values. The reference tensors can be `float`, `torch.Tensor`, `numpy.ndarray`, or our `FakeTensor`. They provides the shape and data type information of the inputs and outputs of the epilogue.\n",
"\n",
"The epilogue can be generated simply through `cutlass.evt.trace(<epilogue function>, <example_tensors>)`."
"The epilogue can be generated simply through `cutlass_cppgen.evt.trace(<epilogue function>, <example_tensors>)`."
]
},
{
@ -139,7 +139,7 @@
"bias = torch.ceil(torch.empty(size=(m, 1), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
"tensor_F = torch.zeros_like(tensor_D)\n",
"examples_tensors = {\n",
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass.LayoutType.RowMajor),\n",
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass_cppgen.LayoutType.RowMajor),\n",
" \"alpha\": alpha,\n",
" \"C\": tensor_C,\n",
" \"beta\": beta,\n",
@ -150,7 +150,7 @@
"}\n",
"\n",
"# Trace the epilogue visitor\n",
"epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)"
"epilogue_visitor = cutlass_cppgen.epilogue.trace(example_epilogue, examples_tensors)"
]
},
{