CUTLASS 3.8 Release (#2059)

* CUTLASS 3.8 Release

* update

* Update README.md

* Revert "Update README.md"

This reverts commit b353e36fe8.

* update

* update

---------

Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
mihir-awatramani
2025-01-24 23:44:06 -08:00
committed by GitHub
parent 9eb01fa0b0
commit 389e493055
290 changed files with 91223 additions and 292 deletions

View File

@@ -118,6 +118,7 @@ void FilterArchitecture() {
{ "SM80*", 80, kMaxDevice},
{ "SM89*", 89, 89},
{ "SM90*", 90, 90},
{ "SM100*", 100, 100},
{ 0, 0, false }
};
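
For context, a minimal sketch of how such a wildcard table can gate tests by device compute capability. The ArchEntry struct, the test_enabled helper, and the exact matching semantics are illustrative assumptions, not the testbed's actual implementation.

#include <cassert>
#include <cstring>
#include <string>

// Hypothetical mirror of the filter table: a test-name prefix pattern and an
// inclusive [min, max] compute-capability range on which the test may run.
struct ArchEntry {
  const char *pattern;  // e.g. "SM100*"
  int min_cc;           // lowest compute capability that runs the test
  int max_cc;           // highest compute capability that runs the test
};

// True if a test whose name starts with the pattern prefix (sans '*') is
// allowed on a device with the given compute capability.
inline bool test_enabled(const ArchEntry *table, const std::string &test_name, int device_cc) {
  for (const ArchEntry *e = table; e->pattern != nullptr; ++e) {
    std::size_t prefix_len = std::strlen(e->pattern) - 1;  // drop trailing '*'
    if (test_name.compare(0, prefix_len, e->pattern, prefix_len) == 0) {
      return device_cc >= e->min_cc && device_cc <= e->max_cc;
    }
  }
  return true;  // names matching no pattern are not filtered
}

int main() {
  constexpr int kMaxDevice = 9999;  // assumed "no upper bound" sentinel
  ArchEntry table[] = {
    {"SM80*", 80, kMaxDevice},
    {"SM89*", 89, 89},
    {"SM90*", 90, 90},
    {"SM100*", 100, 100},
    {nullptr, 0, 0},
  };
  assert(test_enabled(table, "SM100Only_Device_Gemm", 100));  // new SM100 row
  assert(!test_enabled(table, "SM90_Device_Gemm", 100));      // SM90-only test filtered out
  return 0;
}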

View File

@@ -679,6 +679,11 @@ struct GetName<cutlass::float_e4m3_t> {
static constexpr char name[] = "float_e4m3_t";
};
template <>
struct GetName<cutlass::float_e5m2_t> {
static constexpr char name[] = "float_e5m2_t";
};
template <>
struct GetName<cutlass::half_t> {
static constexpr char name[] = "half_t";
@@ -724,13 +729,20 @@ using VectorConvertTypes = ::testing::Types<
ResultSourcePair<cutlass::bfloat16_t, uint8_t>,
ResultSourcePair<cutlass::bfloat16_t, int8_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::float_e5m2_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::half_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::float_e5m2_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::half_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::float_e5m2_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::half_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::uint4b_t>,
ResultSourcePair<cutlass::half_t, cutlass::uint4b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::uint4b_t>,
ResultSourcePair<float, cutlass::int4b_t>
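
For reference, a minimal sketch of how the GetName trait and the ResultSourcePair carriers fit together to label the typed conversion tests. The stand-in types and the print_pair helper are hypothetical, shown only to illustrate the pattern.

#include <cstdio>

// Stand-ins for the CUTLASS numeric types (assumptions for illustration).
struct float_e4m3_t {};
struct float_e5m2_t {};
struct int2b_t {};

// Each specialization carries a printable name, as in the hunk above.
template <typename T> struct GetName { static constexpr char name[] = "unknown"; };
template <> struct GetName<float_e4m3_t> { static constexpr char name[] = "float_e4m3_t"; };
template <> struct GetName<float_e5m2_t> { static constexpr char name[] = "float_e5m2_t"; };
template <> struct GetName<int2b_t>      { static constexpr char name[] = "int2b_t"; };

// A ResultSourcePair-style carrier as used by the typed test list.
template <typename Result, typename Source> struct ResultSourcePair {
  using ResultType = Result;
  using SourceType = Source;
};

template <typename Pair> void print_pair() {
  std::printf("convert %s -> %s\n",
              GetName<typename Pair::SourceType>::name,
              GetName<typename Pair::ResultType>::name);
}

int main() {
  print_pair<ResultSourcePair<float_e5m2_t, int2b_t>>();  // prints: convert int2b_t -> float_e5m2_t
  return 0;
}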

View File

@@ -29,6 +29,10 @@
add_custom_target(cutlass_test_unit_gemm_device)
add_custom_target(test_unit_gemm_device)
add_subdirectory(sm100_blockscaled_tensorop_gemm)
################################################################################
function(cutlass_test_unit_gemm_device_add_deps NAME)
@@ -433,12 +437,12 @@ cutlass_test_unit_gemm_device_add_executable(
gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
sm80_gemm_f64_f64_f64_tensor_op_f64.cu
# SM90 device level tests
gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu
gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu
sm80_gemm_f64_f64_f64_tensor_op_f64.cu
gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu
gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu
gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu
@@ -821,3 +825,147 @@ if (CUTLASS_NVCC_DEVICE_COMPILE)
endif()
if(CUTLASS_NVCC_ARCHS MATCHES "100")
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_sm100_fp16_gemm
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_f16_f16_f32_tensor_op_f32.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_stream_k
sm100_gemm_f16_f16_f16_tensor_op_f32_stream_k.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_sm100_bf16_gemm
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_bf16_bf16_f32_tensor_op_f32.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_stride_batch_alpha_beta_sm100
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_f8_f8_f8_tensor_op_s32_batch_alpha_beta.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_runtime_datatype_sm100
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_f8_f8_f8_tensor_op_f32_runtime_datatype.cu
sm100_gemm_f6_f6_f32_tensor_op_f32_runtime_datatype.cu
sm100_gemm_f4_f4_f32_tensor_op_f32_runtime_datatype.cu
sm100_gemm_f8_f4_f32_tensor_op_f32_runtime_datatype.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_tensorop_sm100_ptr_array
# 14 (9 + 5) unit tests
sm100_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu
sm100_gemm_bf16_bf16_bf16_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_tensorop_sm100_group_gemm
sm100_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_mixed_tensorop_sm100_ptr_array
# 14 (9 + 5) unit tests
sm100_gemm_f16_f16_f32_tensor_op_f32_ptr_array.cu
sm100_gemm_f16_f16_f16_tensor_op_f16_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_32b_tensorop_sm100_ptr_array
# 10 unit tests
sm100_gemm_f32_f32_f32_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_32b_tensorop_sm100_group_gemm
# 10 unit tests
sm100_gemm_f32_f32_f32_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_8b_tensorop_sm100_ptr_array
# 12 unit tests
sm100_gemm_i8_i8_i8_tensor_op_s32_ptr_array.cu
sm100_gemm_f8_f8_f8_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_8b_tensorop_sm100_group_gemm
# 8 unit tests
sm100_gemm_f8_f8_f8_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_mxf8_training_sm100_group_gemm
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_mxf8_mxf8_mxf8_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_mxf4xmxf8_sm100_group_gemm
# 8 unit tests
sm100_gemm_mxf4_mxf8_mxf8_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_blockscaled_gemm_device_fp4_tensorop_sm100_ptr_array
# 8 unit tests
sm100_gemm_f4_f4_f32_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_blockscaled_gemm_device_fp4_tensorop_sm100_group_gemm_1
# 8 unit tests
sm100_gemm_f4_f4_f32_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_blockscaled_gemm_device_fp6_tensorop_sm100_ptr_array
# 8 unit tests
sm100_gemm_f6_f6_f32_tensor_op_f32_ptr_array.cu
)
endif()

File diff suppressed because it is too large

View File

@@ -111,6 +111,18 @@ struct ElementScalarType<Gemm, Default, std::void_t<typename Gemm::EpilogueOutpu
using Type = typename Gemm::EpilogueOutputOp::ElementScalar;
};
template <typename Gemm, typename = void>
struct IsF8F6F4Kernel {
static constexpr bool value = false;
};
template <typename Gemm>
struct IsF8F6F4Kernel<Gemm, std::void_t<decltype(Gemm::GemmKernel::CollectiveMainloop::IsF8F6F4)>> {
static constexpr bool value = true;
};
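
The IsF8F6F4Kernel trait above uses the standard std::void_t detection idiom: the partial specialization participates only when the nested IsF8F6F4 member is well-formed. A self-contained sketch of the same idiom, with hypothetical stand-in types:

#include <type_traits>

// Primary template: assume the marker is absent.
template <typename Kernel, typename = void>
struct HasF8F6F4 : std::false_type {};

// Chosen by partial ordering only when decltype(Kernel::IsF8F6F4) is well-formed.
template <typename Kernel>
struct HasF8F6F4<Kernel, std::void_t<decltype(Kernel::IsF8F6F4)>> : std::true_type {};

// Hypothetical kernels standing in for Gemm::GemmKernel::CollectiveMainloop.
struct PlainMainloop {};
struct F8F6F4Mainloop { static constexpr bool IsF8F6F4 = true; };

static_assert(!HasF8F6F4<PlainMainloop>::value, "no marker, primary template wins");
static_assert(HasF8F6F4<F8F6F4Mainloop>::value, "marker present, specialization wins");

int main() { return 0; }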
// The maximum swizzle size to use
//
// This class, like Splits above makes it harder to confuse
@@ -212,9 +224,26 @@ bool initialize_tensor(
scope_max = 2;
scope_min = 0;
}
else if (bits_input <= 6) {
scope_max = 2;
scope_min = -2;
}
else if (bits_input <= 8) {
if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
scope_max = 4;
scope_min = 1;
}
else {
scope_max = 1;
scope_min = -1;
}
}
else {
scope_max = 4;
@@ -487,6 +516,277 @@ struct HostCollectiveMainloop {
}
};
//
// Block Scaled Gemm Input Operands : A , B, scalefactorA, scalefactorB
//
template<
class Gemm,
int SchedulerPipelineStageCount_,
int AccumulatorPipelineStageCount_,
class ElementA_,
class ElementB_
>
struct HostCollectiveMainloop<cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockScaledSm100<
SchedulerPipelineStageCount_,
AccumulatorPipelineStageCount_>,
Gemm, ElementA_, ElementB_> {
// Kernel data types
using ElementA = ElementA_;
using StrideA = typename Gemm::GemmKernel::StrideA;
using InternalStrideA = typename Gemm::GemmKernel::InternalStrideA;
using ElementB = ElementB_;
using StrideB = typename Gemm::GemmKernel::StrideB;
using InternalStrideB = typename Gemm::GemmKernel::InternalStrideB;
using ScheduleType = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy::Schedule;
using LayoutTagA = cutlass::detail::StrideToLayoutTagA_t<StrideA>;
using LayoutTagB = cutlass::detail::StrideToLayoutTagB_t<StrideB>;
static constexpr bool IsGroupGemm = !cute::is_same_v<StrideA, InternalStrideA>;
using ElementAccumulator = typename Gemm::GemmKernel::ElementAccumulator;
using ElementScalingFactor = ElementAccumulator;
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
using EpilogueOutputOp = typename Gemm::EpilogueOutputOp;
static constexpr int SFVecSize = Gemm::GemmKernel::CollectiveMainloop::SFVecSize;
using ElementSF = typename Gemm::GemmKernel::ElementSF;
using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
using Blk_MN = typename Sm100BlkScaledConfig::Blk_MN;
using Blk_SF = typename Sm100BlkScaledConfig::Blk_SF;
using SfAtom = typename Sm100BlkScaledConfig::SfAtom;
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
using InternalLayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
using InternalLayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
using Arguments = typename Gemm::GemmKernel::MainloopArguments;
// Whether to use relative equality checks
CheckEquality check_relative_equality = CheckEquality::EXACT;
std::vector<InternalStrideA> stride_a_host;
std::vector<InternalStrideB> stride_b_host;
cutlass::DeviceAllocation<InternalStrideA> stride_a_device;
cutlass::DeviceAllocation<InternalStrideB> stride_b_device;
std::vector<InternalLayoutSFA> layout_sfa_host;
std::vector<InternalLayoutSFB> layout_sfb_host;
cutlass::DeviceAllocation<InternalLayoutSFA> layout_sfa_device;
cutlass::DeviceAllocation<InternalLayoutSFB> layout_sfb_device;
typename LayoutTagA::Stride stride_factor_A;
typename LayoutTagB::Stride stride_factor_B;
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
std::vector<cutlass::HostTensor<ElementA, LayoutTagA>> tensors_A;
std::vector<cutlass::HostTensor<ElementB, LayoutTagB>> tensors_B;
std::vector<cutlass::HostTensor<ElementSF, LayoutTagA>> tensors_SFA;
std::vector<cutlass::HostTensor<ElementSF, LayoutTagB>> tensors_SFB;
cutlass::DeviceAllocation<const ElementA *> device_tensors_A;
cutlass::DeviceAllocation<const ElementB *> device_tensors_B;
cutlass::DeviceAllocation<const ElementSF *> device_tensors_SFA;
cutlass::DeviceAllocation<const ElementSF *> device_tensors_SFB;
uint64_t seed;
static constexpr uint64_t kDefaultSeed = 4096;
// Note: this limitation comes from the testbed, not the library
static_assert(is_row_or_col_major<InternalStrideA>(),
"ERROR: A layout is neither row nor column major");
static_assert(is_row_or_col_major<InternalStrideB>(),
"ERROR: B layout is neither row nor column major");
HostCollectiveMainloop(
CheckEquality check_relative_equality_ = CheckEquality::EXACT,
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
uint64_t seed_ = kDefaultSeed,
typename LayoutTagA::Stride stride_factor_A_ = typename LayoutTagA::Stride(),
typename LayoutTagB::Stride stride_factor_B_ = typename LayoutTagB::Stride()
):
check_relative_equality(check_relative_equality_),
stride_factor_A(stride_factor_A_),
stride_factor_B(stride_factor_B_),
init_A(init_A_), init_B(init_B_), seed(seed_) { }
template<class ProblemShapeType>
bool initialize(ProblemShapeType problem_shapes) {
//
// Allocate the GEMM workspace
//
tensors_A.clear();
tensors_B.clear();
stride_a_host.clear();
stride_b_host.clear();
tensors_SFA.clear();
tensors_SFB.clear();
layout_sfa_host.clear();
layout_sfb_host.clear();
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
L = std::max(problem_shapes.groups(), L);
for (int32_t i = 0; i < L; ++i) {
auto [M, N, K, mock_L] = cute::append<4>(problem_shapes.get_host_problem_shape(i), 1);
stride_a_host.push_back(cutlass::make_cute_packed_stride(InternalStrideA{}, {M, K, 1}));
stride_b_host.push_back(cutlass::make_cute_packed_stride(InternalStrideB{}, {N, K, 1}));
// 2.x host tensor does not natively contain a batch stride or coord, so we spoof it by folding it into the outer mode
auto a_coord = cutlass::make_Coord(M, K);
// CUTLASS row/column major refers to the MxK times KxN matrix product,
// so the B host tensor should be treated as KxN in the coord's view
auto b_coord = cutlass::make_Coord(K, N);
tensors_A.push_back(cutlass::HostTensor<ElementA, LayoutTagA>(a_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagA>::layout_factory(a_coord, stride_factor_A)));
tensors_B.push_back(cutlass::HostTensor<ElementB, LayoutTagB>(b_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagB>::layout_factory(b_coord, stride_factor_B)));
EXPECT_TRUE(initialize_tensor(tensors_A[i].host_view(), init_A, seed + 2022 + i));
EXPECT_TRUE(initialize_tensor(tensors_B[i].host_view(), init_B, seed + 2021 + i));
// It is possible to randomly initialize to all zeros, so override this with non-zeros
// in the upper left corner of each operand.
tensors_A[i].host_view().at({0, 0}) = ElementA(1);
tensors_B[i].host_view().at({0, 0}) = ElementB(1);
tensors_A[i].sync_device();
tensors_B[i].sync_device();
using namespace cute;
auto k_blks = cutlass::ceil_div(K, size<1>(shape(SfAtom{})));
auto m_blks = cutlass::ceil_div(M, Blk_MN{});
auto n_blks = cutlass::ceil_div(N, Blk_MN{});
layout_sfa_host.push_back(Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1)));
layout_sfb_host.push_back(Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1)));
// 2.x host tensor does not natively contain a batch stride or coord, so we spoof it by folding it into the outer mode
auto sfa_coord = cutlass::make_Coord(m_blks * Blk_MN{}, k_blks * Blk_SF{});
auto sfb_coord = cutlass::make_Coord(n_blks * Blk_MN{}, k_blks * Blk_SF{});
tensors_SFA.push_back(cutlass::HostTensor<ElementSF, LayoutTagA>(sfa_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagA>::layout_factory(sfa_coord, stride_factor_A)));
tensors_SFB.push_back(cutlass::HostTensor<ElementSF, LayoutTagB>(sfb_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagB>::layout_factory(sfb_coord, stride_factor_B)));
EXPECT_TRUE(initialize_tensor(tensors_SFA[i].host_view(), init_A, seed + 2024 + i));
EXPECT_TRUE(initialize_tensor(tensors_SFB[i].host_view(), init_B, seed + 2025 + i));
// It is possible to randomly initialize to all zeros, so override this with non-zeros
// in the upper left corner of each operand.
tensors_SFA[i].host_view().at({0, 0}) = ElementSF(1);
tensors_SFB[i].host_view().at({0, 0}) = ElementSF(1);
tensors_SFA[i].sync_device();
tensors_SFB[i].sync_device();
}
return true;
}
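
A worked sketch of the scale-factor extent arithmetic in initialize() above. The constants Blk_MN = 128, Blk_SF = 4, SFVecSize = 32, and an SfAtom K extent of SFVecSize * Blk_SF are illustrative assumptions; the real values come from Sm100BlkScaledConfig.

#include <cstdio>

// Same rounding the testbed uses via cutlass::ceil_div.
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Assumed config values (see lead-in); not taken from the library.
  constexpr int Blk_MN = 128;                  // rows covered by one SF atom
  constexpr int Blk_SF = 4;                    // SF columns per atom
  constexpr int SFVecSize = 32;                // elements sharing one scale factor
  constexpr int SfAtomK = SFVecSize * Blk_SF;  // assumed K extent of SfAtom

  int M = 300, K = 1024;
  int m_blks = ceil_div(M, Blk_MN);   // 3 -> M padded up to 384 rows of SFA
  int k_blks = ceil_div(K, SfAtomK);  // 8
  // SFA host tensor extents, padded to whole atoms as in initialize():
  std::printf("SFA extents: %d x %d\n", m_blks * Blk_MN, k_blks * Blk_SF);  // 384 x 32
  return 0;
}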
Arguments to_args(ProblemShapeType problem_shapes) {
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
L = std::max(problem_shapes.groups(), L);
std::vector<ElementA *> ptr_A_host(L);
std::vector<ElementB *> ptr_B_host(L);
std::vector<ElementSF *> ptr_SFA_host(L);
std::vector<ElementSF *> ptr_SFB_host(L);
for (int32_t i = 0; i < L; ++i) {
ptr_A_host.at(i) = tensors_A[i].device_data();
ptr_B_host.at(i) = tensors_B[i].device_data();
ptr_SFA_host.at(i) = tensors_SFA[i].device_data();
ptr_SFB_host.at(i) = tensors_SFB[i].device_data();
}
device_tensors_A.reset(L);
device_tensors_A.copy_from_host(ptr_A_host.data());
device_tensors_B.reset(L);
device_tensors_B.copy_from_host(ptr_B_host.data());
device_tensors_SFA.reset(L);
device_tensors_SFA.copy_from_host(ptr_SFA_host.data());
device_tensors_SFB.reset(L);
device_tensors_SFB.copy_from_host(ptr_SFB_host.data());
stride_a_device.reset(problem_shapes.groups());
stride_a_device.copy_from_host(stride_a_host.data());
stride_b_device.reset(problem_shapes.groups());
stride_b_device.copy_from_host(stride_b_host.data());
layout_sfa_device.reset(problem_shapes.groups());
layout_sfa_device.copy_from_host(layout_sfa_host.data());
layout_sfb_device.reset(problem_shapes.groups());
layout_sfb_device.copy_from_host(layout_sfb_host.data());
if constexpr (IsGroupGemm) {
return Arguments{
device_tensors_A.get(), stride_a_device.get(),
device_tensors_B.get(), stride_b_device.get(),
device_tensors_SFA.get(), layout_sfa_device.get(),
device_tensors_SFB.get(), layout_sfb_device.get()
};
}
else {
return Arguments{
device_tensors_A.get(), stride_a_host[0],
device_tensors_B.get(), stride_b_host[0],
device_tensors_SFA.get(), layout_sfa_host[0],
device_tensors_SFB.get(), layout_sfb_host[0]
};
}
}
auto to_host_args(ProblemShapeType problem_shapes, int batch) {
using namespace cute;
//
// Allocate the GEMM workspace
//
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(batch), 1);
auto A = make_tensor(make_iterator(tensors_A[batch].host_data()),
make_layout(make_shape(M, K, 1), stride_a_host[batch]));
auto SfA = make_tensor(tensors_SFA[batch].host_data(), layout_sfa_host[batch]);
auto B = make_tensor(make_iterator(tensors_B[batch].host_data()),
make_layout(make_shape(N, K, 1), stride_b_host[batch]));
auto SfB = make_tensor(tensors_SFB[batch].host_data(), layout_sfb_host[batch]);
return cutlass::reference::host::GettMainloopParams<ElementAccumulator,
decltype(A),
decltype(B),
decltype(SfA),
decltype(SfB)
>
{A, SfA, B, SfB};
}
void print_tensors(std::ofstream& file, int batch) {
file << "A =\n" << tensors_A[batch].host_view()
<< "\nB =\n" << tensors_B[batch].host_view()
<< "\nSFA =\n" << tensors_SFA[batch].host_view()
<< "\nSFB =\n" << tensors_SFB[batch].host_view();
}
bool compare_reference(
ProblemShapeType problem_shapes, int batch) {
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_A[batch].host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_B[batch].host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_SFA[batch].host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_SFB[batch].host_view()), 0);
return true;
}
};
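
Taken together, this specialization exposes the same surface as the other testbed mainloops: initialize operands, hand device pointers to the kernel arguments, then validate each batch against the host reference. A hedged sketch of that flow with mock types (the Mock* types and run_testbed are hypothetical names, not testbed API):

#include <cassert>

// Hypothetical mocks mirroring the HostCollectiveMainloop surface above.
struct MockProblemShape { int groups() const { return 1; } };
struct MockArguments {};
struct MockHostCollectiveMainloop {
  bool initialize(MockProblemShape) { return true; }              // allocate and fill A/B/SFA/SFB
  MockArguments to_args(MockProblemShape) { return {}; }          // upload pointer arrays, build args
  bool compare_reference(MockProblemShape, int) { return true; }  // per-batch validation
};

// The testbed flow: initialize, build kernel arguments, (launch), validate per batch.
template <class Mainloop, class ProblemShape>
bool run_testbed(Mainloop &mainloop, ProblemShape shape) {
  if (!mainloop.initialize(shape)) return false;
  auto args = mainloop.to_args(shape);
  (void)args;  // in the real testbed these feed Gemm::Arguments and a device launch
  bool passed = true;
  for (int b = 0; b < shape.groups(); ++b) {
    passed &= mainloop.compare_reference(shape, b);
  }
  return passed;
}

int main() {
  MockHostCollectiveMainloop mainloop;
  assert(run_testbed(mainloop, MockProblemShape{}));
  return 0;
}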
template<class Gemm>
struct HostCollectiveDefaultEpilogue {
// fusion types are potentially void if the fusion is not supported
@@ -803,6 +1103,24 @@ struct HostCollectiveEpilogue {
using FusionOp = typename Gemm::EpilogueOutputOp;
static_assert(cute::is_base_of_v<cutlass::epilogue::fusion::FusionOperation, FusionOp>);
// Scale factor Generation related
using SfStrategy = cutlass::reference::host::SfStrategy;
static constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
static constexpr SfStrategy SfGenStrategy = (!IsBlockScaleSupported) ? SfStrategy::None : SfStrategy::SfDGen;
static constexpr int32_t SFD_VectorSize = IsBlockScaleSupported ? FusionOp::SFVecSize : 1;
using ElementSFD = non_void_t<cute::remove_pointer_t<typename FusionOp::ElementBlockScaleFactor>, ElementD>;
using Sm100BlockScaledOutputConfig = cutlass::detail::Sm100BlockScaledOutputConfig<
SFD_VectorSize
>;
using Blk_MN = typename Sm100BlockScaledOutputConfig::Blk_MN;
using Blk_SF = typename Sm100BlockScaledOutputConfig::Blk_SF;
using OutputSFAtom = typename Sm100BlockScaledOutputConfig::SfAtom;
std::vector<cutlass::HostTensor<ElementSFD, LayoutTagD>> tensors_SFD;
std::vector<cutlass::HostTensor<ElementSFD, LayoutTagD>> references_SFD;
cutlass::DeviceAllocation<ElementSFD *> device_tensors_SFD;
using ElementCompute = typename FusionOp::ElementCompute;
using ElementScalar = typename FusionOp::ElementScalar;
using ElementBias = non_void_t<typename FusionOp::ElementBias>;
@@ -904,6 +1222,11 @@ struct HostCollectiveEpilogue {
references_D.clear();
stride_c_host.clear();
stride_d_host.clear();
tensors_SFD.clear();
references_SFD.clear();
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
L = std::max(problem_shapes.groups(), L);
@@ -1034,6 +1357,26 @@ struct HostCollectiveEpilogue {
}
}
if constexpr (IsBlockScaleSupported) {
for (int32_t i = 0; i < L; ++i) {
auto [M, N, K, _] = cute::append<4>(problem_shapes.get_host_problem_shape(i), 1);
// If block-scaled output is supported, we always have at least 1 SFD
auto m_blks = cutlass::ceil_div(M, cute::size<0>(cute::shape(OutputSFAtom{})));
auto n_blks = cutlass::ceil_div(N, cute::size<1>(cute::shape(OutputSFAtom{})));
auto sfd_coord = [&] () {
return cutlass::make_Coord(m_blks * Blk_MN{}, n_blks * Blk_SF{});
}();
tensors_SFD.push_back(cutlass::HostTensor<ElementSFD, LayoutTagD>(sfd_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(sfd_coord, stride_factor_D)));
references_SFD.push_back(cutlass::HostTensor<ElementSFD, LayoutTagD>(sfd_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(sfd_coord, stride_factor_D), false));
tensors_SFD[i].sync_device();
}
norm_constant.resize(scalar_coord, true);
EXPECT_TRUE(initialize_tensor(norm_constant.host_view(), init_scale, seed + 2023));
norm_constant.sync_device();
}
return true;
}
@@ -1116,6 +1459,17 @@ struct HostCollectiveEpilogue {
passed &= tmp;
}
}
if constexpr (IsBlockScaleSupported) {
tensors_SFD[batch].sync_host();
bool passed_sf = equality_check(references_SFD[batch].host_view(), tensors_SFD[batch].host_view());
if (!passed_sf) {
std::cout << "SF is incorrect" << std::endl;
}
passed &= passed_sf;
}
return passed;
}
@@ -1308,6 +1662,19 @@ struct HostCollectiveEpilogue {
fusion_args.amax_aux_ptr = abs_max_Aux.device_data();
}
}
if constexpr (IsBlockScaleSupported) {
std::vector<ElementSFD *> ptr_SFD_host(L);
for (int32_t i = 0; i < L; ++i) {
ptr_SFD_host.at(i) = tensors_SFD[i].device_data();
}
device_tensors_SFD.reset(L);
device_tensors_SFD.copy_from_host(ptr_SFD_host.data());
arguments.thread.block_scale_factor_ptr = device_tensors_SFD.get();
arguments.thread.norm_constant_ptr = norm_constant.device_data();
}
}
return arguments;
@@ -1341,6 +1708,20 @@ struct HostCollectiveEpilogue {
cute::make_layout(cute::make_shape(M, N, cute::_1{}), cute::make_stride(cute::_1{}, cute::_0{}, M)));
auto Vbeta = cute::make_tensor(detail::make_iterator(beta.host_data()),
cute::make_layout(cute::make_shape(M, N, cute::_1{}), cute::make_stride(cute::_1{}, cute::_0{}, N)));
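// Select the reference SfD tensor at compile time: an immediately-invoked lambda
// lets the two if-constexpr branches return different tensor types.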
auto SfD = [&](){
if constexpr (IsBlockScaleSupported) {
auto tensor = make_tensor(detail::make_iterator(references_SFD[batch].host_data()),
Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL));
return tensor;
}
else {
// The reference kernel skips the scale-factor computation when the tensor passed in has the same type as the output D tensor.
return D;
}
}();
cutlass::reference::host::GettEpilogueParams<
ElementScalar,
ElementScalar,
@@ -1353,8 +1734,11 @@ struct HostCollectiveEpilogue {
decltype(Valpha),
decltype(Vbeta),
ActivationFunctor
, decltype(SfD)
, Int<SFD_VectorSize>
, cutlass::plus<ElementCompute>
, false
, SfGenStrategy
> epilogue_params{};
epilogue_params.C = C;
@@ -1397,6 +1781,12 @@ struct HostCollectiveEpilogue {
epilogue_params.Vbeta = Vbeta;
}
}
if constexpr (IsBlockScaleSupported) {
epilogue_params.SfD = SfD;
epilogue_params.st = norm_constant.at(coord_0);
}
return epilogue_params;
}
};
@@ -1812,8 +2202,24 @@ bool TestSmall(double alpha = 1.0, double beta = 1.0,
using ElementB = typename Gemm::GemmKernel::ElementB;
using TiledMma = typename Gemm::GemmKernel::TiledMma;
int alignment_bits = 128;
static constexpr bool IsF8F6F4 = cutlass::gemm::collective::detail::is_sm100_mma_f8f6f4<TiledMma, ElementA, ElementB>();
alignment_bits = cutlass::detail::get_input_alignment_bits<ElementA, IsF8F6F4>();
// For fp4 and fp6 mx kernels, the minimum input alignment is already 128 elements, so no alignment offset needs to be added to the test problem sizes.
int alignment_input = (alignment_bits / cute::sizeof_bits<ElementA>::value == 128) ? 0 : (alignment_bits / cute::sizeof_bits<ElementA>::value);
if constexpr (apply_alignment_offset) {
// If BlockScaled, then min alignment is SFVecSize
static constexpr bool IsBlockScaleSupported = Gemm::EpilogueOutputOp::IsBlockScaleSupported;
static constexpr int SFVecSize = Gemm::GemmKernel::CollectiveMainloop::SFVecSize;
if constexpr (IsBlockScaleSupported) {
alignment_input = cutlass::round_up(alignment_input, SFVecSize);
}
}
using CtaShape_MNK = typename Gemm::GemmKernel::CollectiveMainloop::CtaShape_MNK;
using DispatchPolicy = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy;
CtaShape_MNK cta_shape;
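
A worked sketch of the alignment arithmetic above, with illustrative numbers. The bit widths and alignment_bits values are assumptions standing in for get_input_alignment_bits, not values read from the library.

#include <cstdio>

constexpr int round_up(int x, int m) { return ((x + m - 1) / m) * m; }

int main() {
  // Case 1: an f8f6f4-style kernel, 8-bit operands, 128-bit alignment (assumed).
  int alignment_bits = 128, sizeof_bits_A = 8;
  int alignment_input =
      (alignment_bits / sizeof_bits_A == 128) ? 0 : (alignment_bits / sizeof_bits_A);
  std::printf("8-bit operands: alignment_input = %d elements\n", alignment_input);  // 16

  // Case 2: a 4-bit mx kernel whose minimum alignment is already 128 elements
  // (assuming 512 alignment bits), so no extra offset is added to problem sizes.
  alignment_bits = 512; sizeof_bits_A = 4;
  alignment_input =
      (alignment_bits / sizeof_bits_A == 128) ? 0 : (alignment_bits / sizeof_bits_A);
  std::printf("4-bit mx operands: alignment_input = %d elements\n", alignment_input);  // 0

  // Block-scaled kernels additionally round a nonzero offset up to SFVecSize.
  int SFVecSize = 32;
  std::printf("rounded to SF vector size: %d\n", round_up(16, SFVecSize));  // 32
  return 0;
}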

View File

@@ -258,6 +258,12 @@ struct Testbed3xTensorBroadcast {
cute::make_layout(cute::make_shape(M, N, 1), cute::make_stride(cute::_1{}, cute::_0{}, M)));
auto dummy_Vbeta = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
cute::make_layout(cute::make_shape(M, N, 1), cute::make_stride(cute::_1{}, cute::_0{}, M)));
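// Block scaling is unused in this testbed: the null SFD tensor and zero SF vector
// size below only satisfy the newly added GettEpilogueParams template parameters.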
auto dummy_SFD = cute::make_tensor(static_cast<ElementD*>(nullptr),
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
using DummySFDVectorSize = cute::Int<0>;
cutlass::reference::host::GettEpilogueParams<
ElementScalar,
ElementScalar,
@@ -270,6 +276,8 @@ struct Testbed3xTensorBroadcast {
decltype(dummy_Valpha),
decltype(dummy_Vbeta),
ActivationFunctor,
decltype(dummy_SFD),
DummySFDVectorSize,
cutlass::plus<ElementCompute>,
PerColBias> epilogue_params{
alpha,

View File

@@ -0,0 +1,150 @@
# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
if(CUTLASS_NVCC_ARCHS MATCHES "100")
add_custom_target(
cutlass_test_unit_gemm_device_sm100_blockscaled
DEPENDS
cutlass_test_unit_gemm_device_bstensorop_sm100_nvf4xnvf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_nvf4xnvf4
BATCH_SOURCES ON
BATCH_SIZE 1
nvf4_nvf4_bf16_bf16.cu
nvf4_nvf4_bf16_bf16_features.cu
nvf4_nvf4_f16_nvfp4_epilogue.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
BATCH_SOURCES ON
BATCH_SIZE 1
mxf4_mxf4_void_f16_tn_layout.cu
mxf4_mxf4_void_f16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
BATCH_SOURCES ON
BATCH_SIZE 1
mxf6_mxf6_void_bf16_tn_layout.cu
mxf6_mxf6_void_bf16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
BATCH_SOURCES ON
BATCH_SIZE 1
mxf8_mxf8_void_f8_tn_layout.cu
mxf8_mxf8_void_f8_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
BATCH_SOURCES ON
BATCH_SIZE 1
mxf6_mxf8_void_f32_tn_layout.cu
mxf6_mxf8_void_f32_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
BATCH_SOURCES ON
BATCH_SIZE 1
mxf8_mxf6_f16_f8_tn_layout.cu
mxf8_mxf6_f16_f8_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
BATCH_SOURCES ON
BATCH_SIZE 1
mxf4_mxf8_bf16_bf16_tn_layout.cu
mxf4_mxf8_bf16_bf16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
BATCH_SOURCES ON
BATCH_SIZE 1
mxf8_mxf4_f16_bf16_tn_layout.cu
mxf8_mxf4_f16_bf16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
BATCH_SOURCES ON
BATCH_SIZE 1
mxf6_mxf4_f16_f16_tn_layout.cu
mxf6_mxf4_f16_f16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
BATCH_SOURCES ON
BATCH_SIZE 1
mxf4_mxf6_f32_f16_tn_layout.cu
mxf4_mxf6_f32_f16_nt_layout.cu
)
endif()

View File

@@ -0,0 +1,303 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the A/B layouts for this mxfp4 x mxfp4 GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128x128x256_1x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy: Auto or a targeted scheduling policy; the underlying selection here is KernelTmaWarpSpecialized1SmMxf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128x256x256_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256x256x256_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the A/B layouts for this mxfp4 x mxfp4 GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x128x256_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy: Auto or a targeted scheduling policy; the underlying selection here is KernelTmaWarpSpecialized1SmMxf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x192x256_1x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x256x256_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x192x256_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto resolves to the targeted policy; for this 2SM mxf4 GEMM the underlying selection is KernelTmaWarpSpecialized2SmMxf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x256x256_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
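// --------------------------------------------------------------------------
// Illustrative host-side launch flow (a sketch only; TestAll<Gemm>() above is
// the actual harness). The adapter entry points (get_workspace_size,
// can_implement, initialize, run) are the CUTLASS 3.x device API; the exact
// mainloop argument fields for block-scaled kernels (scale-factor pointers
// and layouts) are assumptions for illustration.
//
//   Gemm gemm;
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kGemm,
//     {M, N, K, /*L=*/1},                         // problem shape
//     {ptr_A, stride_A, ptr_B, stride_B,          // operands, plus per-block
//      ptr_SFA, layout_SFA, ptr_SFB, layout_SFB}, // scale factors (assumed fields)
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D}
//   };
//   size_t workspace_size = Gemm::get_workspace_size(args);
//   cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
//   if (gemm.can_implement(args) == cutlass::Status::kSuccess &&
//       gemm.initialize(args, workspace.get()) == cutlass::Status::kSuccess) {
//     gemm.run();
//   }
// --------------------------------------------------------------------------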
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for this mxfp4 x mxfp6 mixed-precision GEMM:
the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned. For example, a
row-major (T) B operand has stride-1 along N, so the 128x192x128 tile is unsupported for TT/NT layouts
(see the matrix below).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
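// mx_float6_t pairs e2m3 (FP6) data with a shared ue8m0 scale factor per 32-element block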
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
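// With a 4x4x1 cluster, TMA can multicast each A tile to the 4 CTAs sharing its M index and each B tile to the 4 CTAs sharing its N index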
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, an f32 source (C) tensor consumes too much SMEM in the epilogue, so f16 is used.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for this mxfp4 x mxfp6 mixed-precision GEMM:
the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned. For example, a
row-major (T) B operand has stride-1 along N, so the 128x192x128 tile is unsupported for TT/NT layouts
(see the matrix below).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x192x128_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, an f32 source (C) tensor consumes too much SMEM in the epilogue, so f16 is used.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x128x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* Supported MMA tile shapes:
For the A tensor (mxfp4), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Since every tile below has K=128 and a per-CTA M of 128, and the mxfp8 B tensor only needs 16-element
alignment, all layout combinations are supported.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
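// 16 e5m2 elements = 16 bytes, matching TMA's 128-bit alignment requirement for fp8 operands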
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 128x192x128_1x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, a wider (f32) source C tensor would consume too much SMEM in the epilogue; bf16 is used for C and D.
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
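//
// Hedged sketch of the tile-shape relationship used by the 2SM tests above:
// the MMA tile spans a pair of CTAs along M, so each SM's epilogue tile
// (PerSmTileShape) covers half of the MMA tile in M, while 1SM kernels use
// the MMA tile directly. Checked here for the 256x256x128 case.
//
static_assert(cute::size<0>(Shape<_256,_256,_128>{}) ==
              2 * cute::size<0>(Shape<_128,_256,_128>{}),
              "2SM kernels: PerSmTileShape M == MmaTileShape M / 2");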
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
 * Supported Mma tile shapes:
     For the A tensor (mxfp4 type), the stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
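//
// Hedged sketch relating the alignments above to byte counts: the packed
// 4-bit mxfp4 A operand needs a larger element alignment than the 8-bit
// mxfp8 B operand. The arithmetic below is plain bit counting, stated for
// illustration rather than quoted from this file.
//
static_assert(128 * cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 64 * 8,
              "AlignA: 128 e2m1 elements span 64 bytes");
static_assert(16 * cutlass::sizeof_bits<cutlass::float_e5m2_t>::value == 16 * 8,
              "AlignB: 16 fp8 elements span 16 bytes");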
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, f32 C/D tensors consume too much epilogue SMEM, so a 16-bit output type (bf16 here) is used.
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
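//
// Hedged sketch of what "Cluster size for multicast" means in the tests
// above: the cluster shape groups CTAs so that TMA loads of A/B tiles can be
// multicast to the CTAs of a cluster that share them. For the 128x256 test
// above, Shape<_4,_2,_1> groups 4 x 2 = 8 CTAs per cluster.
//
static_assert(cute::size(Shape<_4,_2,_1>{}) == 8, "4x2x1 cluster = 8 CTAs");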
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 256x128x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
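//
// Hedged note, in code form, on the kernel composition used throughout these
// tests: GemmUniversal's first template argument is the runtime problem
// shape, and the four ints are (M, N, K, L) with L the batch count (standard
// CUTLASS 3.x convention; the alias name is illustrative, not from this file).
//
using ProblemShapeMNKL_Illustrative = Shape<int, int, int, int>;  // (M, N, K, L = batch)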
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
 * The supported Mma tile shapes depend on the operand layouts for mxfp4 x mxfp6 mixed-precision GEMM.
     The stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
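//
// Hedged sketch of the scale-factor bookkeeping implied by the mx_float*_t
// types above: MX formats pair each block of elements along K with one ue8m0
// scale factor. Assuming the OCP MX block size of 32 elements (an assumption,
// not stated in this file), an M x K operand carries about M * ceil(K / 32)
// scale factors per batch.
//
constexpr int kSfVecSizeIllustrative = 32;  // assumed elements per ue8m0 scale factor
constexpr long long num_scale_factors(long long m, long long k) {
  return m * ((k + kSfVecSizeIllustrative - 1) / kSfVecSizeIllustrative);  // M * ceil_div(K, 32)
}
static_assert(num_scale_factors(128, 128) == 128 * 4, "a 128x128 tile has 4 scale factors per row");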
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, using f32 for C and D consumes too much epilogue SMEM, so f16 is used instead.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
 * The supported Mma tile shapes depend on the operand layouts for mxfp4 x mxfp6 mixed-precision GEMM.
     The stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
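//
// Hedged sketch of what StageCountAutoCarveout expresses in the mainloop
// builders above: the builder picks the mainloop stage count only after
// reserving the epilogue's shared storage, roughly
// (smem_capacity - epilogue_smem) / bytes_per_mainloop_stage. All numbers
// below are illustrative placeholders, not values from this file.
//
constexpr int estimate_stages(int smem_capacity_bytes, int epilogue_smem_bytes,
                              int bytes_per_mainloop_stage) {
  return (smem_capacity_bytes - epilogue_smem_bytes) / bytes_per_mainloop_stage;
}
static_assert(estimate_stages(200 * 1024, 64 * 1024, 32 * 1024) == 4, "illustrative only");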
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x256x128_1x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, using f32 for C and D consumes too much epilogue SMEM, so f16 is used instead.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x128x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
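  // Note (illustrative): KernelScheduleAuto defers the schedule choice to the
  // collective builder for this configuration. An explicit alternative for a
  // 2SM block-scaled mainloop is the policy used in the preceding test,
  // cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100.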
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for mxfp6 GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned
(an illustrative sketch of this rule follows the comment block).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
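// Illustrative sketch (hypothetical helper, not part of CUTLASS): one way to
// read the support matrix above. An operand whose MN-mode is stride-1 in
// global memory (A column-major or B row-major) needs its per-CTA tile extent
// in that mode to be a multiple of 128; for 2SM kernels, each CTA of the pair
// is assumed to hold half of the MMA tile's M and N extents. K-major operands
// always pass, since every K tile here is 128.
constexpr bool mxfp6_tile_supported(bool is_2sm, int tile_m, int tile_n,
                                    bool a_is_m_major, bool b_is_n_major) {
  int const cta_m = is_2sm ? tile_m / 2 : tile_m;
  int const cta_n = is_2sm ? tile_n / 2 : tile_n;
  return (!a_is_m_major || cta_m % 128 == 0) &&
         (!b_is_n_major || cta_n % 128 == 0);
}
// Spot-check two NT entries from the table: 1SM 128x192 is 'N', 2SM 256x256 is 'Y'.
static_assert(!mxfp6_tile_supported(false, 128, 192, true, true), "NT 128x192 1SM unsupported");
static_assert( mxfp6_tile_supported(true, 256, 256, true, true), "NT 256x256 2SM supported");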
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
  // For N=256, using f32 or f16 for the C/D tensors consumes too much SMEM in the epilogue (see the arithmetic note below).
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
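  // Arithmetic behind the comment above (illustrative): staging a 128x256
  // output tile through SMEM costs 128*256*4 B = 128 KiB per stage in f32,
  // and 64 KiB in f16/bf16. Dropping C (void) and storing D as bf16 keeps the
  // epilogue carveout small enough to leave room for mainloop stages.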
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for mxfp6 mixed-precision GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned
(see the note after this comment block).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
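// Note on the TN column above: with A row-major (M x K) and B column-major
// (K x N), the stride-1 mode of both operands is K, so the 128-element
// alignment requirement falls on the K tile extent. Every MMA tile in this
// file uses K = 128, which is why every TN entry in the matrix is 'Y'.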
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
  // For N=256, using f32 or f16 for the C/D tensors consumes too much SMEM in the epilogue.
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 256x128x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* Supported MMA tile shapes:
For the A tensor (mxfp6 type), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned
(see the note after this comment block).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
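// Note on the matrix above: only the mxfp6 A operand carries the 128-element
// stride-1 requirement; the mxfp8 B operand needs just 16-element alignment
// (AlignB = 16 below). With A column-major, the constrained extent is the
// per-CTA M tile, which is 128 in every configuration here (1SM tiles use
// M = 128, and 2SM tiles split M = 256 across the CTA pair), so every cell
// is 'Y'.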
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256x128x128_4x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* Mma Tile Shapes supported:
For the A tensor (an mxfp6 type), the stride-1 tile dimension must be divisible by 128, i.e., be 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
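// Illustrative helper (editorial sketch; the name below is hypothetical and
// not part of the original tests): the 128-element rule above, expressed as a
// constexpr predicate on the stride-1 extent of the mxfp6 A tensor (K for the
// row-major A used by the tests in this file).
constexpr bool mxfp6_stride1_extent_ok(int extent) {
  return extent % 128 == 0;  // stride-1 dimension must be 128-element aligned
}
static_assert(mxfp6_stride1_extent_ok(128), "K = 128 (one Mma tile) satisfies the rule");
static_assert(!mxfp6_stride1_extent_ok(144), "an extent not divisible by 128 violates it");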
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
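// Illustrative check (editorial sketch, not part of the original test): with
// a 1SM kernel schedule the per-SM epilogue tile is simply the full MMA tile;
// contrast the 2SM tests later in this file, where M is halved per SM.
static_assert(cute::size<0>(PerSmTileShape_MNK{}) == cute::size<0>(MmaTileShape_MNK{}), "1SM: per-SM tile equals the MMA tile");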
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, carrying a source C tensor (f32 or f16) would consume too much SMEM in the epilogue, so C is void here.
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* Mma Tile Shapes supported:
For the B tensor (an mxfp4 type), the stride-1 tile dimension must be divisible by 128, i.e., be 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
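// Editorial sketch (not part of the original file): the alignment requirements
// above are element counts; in bytes they follow from the storage widths of
// the narrow types (e4m3/e5m2 are 8-bit, e2m1 is 4-bit).
static_assert(cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 4, "mxfp4 payload is 4 bits");
static_assert(16 * cutlass::sizeof_bits<cutlass::float_e4m3_t>::value / 8 == 16, "A: 16 elements -> 16 bytes");
static_assert(128 * cutlass::sizeof_bits<cutlass::float_e2m1_t>::value / 8 == 64, "B: 128 elements -> 64 bytes");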
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
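// Editorial sketch of the launch flow that TestAll wraps (assumption:
// canonical CUTLASS 3.x adapter usage; block-scaled mainloops additionally
// take scale-factor tensors in their arguments, which the testbed populates):
//
//   Gemm gemm;
//   auto args = /* typename Gemm::Arguments built from the problem shape,
//                  A/B/C/D pointers and strides, and the SFA/SFB scale factors */;
//   if (Gemm::can_implement(args) == cutlass::Status::kSuccess) {
//     size_t workspace_bytes = Gemm::get_workspace_size(args);
//     gemm.initialize(args, workspace_ptr);  // workspace_ptr: device allocation of workspace_bytes
//     gemm.run();
//   }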
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, f32 C/D tensors would consume too much SMEM in the epilogue, so 16-bit C/D types are used here.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* Mma Tile Shapes supported:
For the B tensor (an mxfp4 type), the stride-1 tile dimension must be divisible by 128, i.e., be 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
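// Note on test naming (editorial, restating the layout legend above): the
// trailing 'n'/'t' of each operand token in a test name encodes that
// operand's GMEM layout -- 'n' = ColumnMajor, 't' = RowMajor -- so
// "ue8m0xe4m3t_ue8m0xe2m1n" below pairs a row-major mxfp8 A with a
// column-major mxfp4 B, i.e. the TN column of the support matrix.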
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x256x128_4x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x192x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
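// TestAll<Gemm> hides argument construction. The sketch below shows the shape
// of an explicit invocation: the block-scaled mainloop takes, alongside A/B,
// the scale-factor tensors SFA/SFB whose layouts are derived from the problem
// shape. The field names (mainloop, epilogue, ptr_SFA/layout_SFA, ...) and the
// Sm1xxBlkScaledConfig helper follow the CUTLASS block-scaled examples and are
// assumptions here; CollectiveMainloop::Arguments is the authoritative source.
template <class Gemm>
typename Gemm::Arguments make_blockscaled_arguments(
    int m, int n, int k, int l,
    typename Gemm::GemmKernel::CollectiveMainloop::Arguments const& mainloop,
    typename Gemm::GemmKernel::CollectiveEpilogue::Arguments const& epilogue) {
  typename Gemm::Arguments args{};
  args.mode          = cutlass::gemm::GemmUniversalMode::kGemm;
  args.problem_shape = cute::make_shape(m, n, k, l);  // (M, N, K, batch L)
  args.mainloop      = mainloop;  // {ptr_A, stride_A, ptr_B, stride_B,
                                  //  ptr_SFA, layout_SFA, ptr_SFB, layout_SFB}
  args.epilogue      = epilogue;
  return args;
}
// The scale-factor layouts would come from the kernel's block-scaled config,
// e.g. layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(shape_MNKL).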
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* Mma Tile Shapes supported:
    For the B tensor (mxfp6 type), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x128x128_1x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
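// The 128-element alignment required of the mxfp6 operand (see the file
// header) is enforced at runtime: can_implement() rejects argument sets whose
// extents or pointers violate an operand's alignment contract before anything
// launches. A small checker sketch around the standard CUTLASS 3.x
// Status-based API:
template <class Gemm>
bool problem_is_implementable(typename Gemm::Arguments const& args) {
  Gemm gemm_op;
  return gemm_op.can_implement(args) == cutlass::Status::kSuccess;
}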
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
  // For N=256, f32 or f16 C/D tensors consume too much SMEM in the epilogue, hence the narrower output types.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* Mma Tile Shapes supported:
    For the B tensor (mxfp6 type), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
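// StageCountAutoCarveout<N> reserves N bytes of SMEM before the mainloop
// builder auto-computes how many pipeline stages fit; here N is exactly the
// epilogue's SharedStorage footprint. A one-line restatement of the quantity
// being carved out:
template <class CollectiveEpilogue>
constexpr int epilogue_smem_carveout_bytes() {
  return static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage));
}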
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
  // For N=256, f32 or f16 C/D tensors consume too much SMEM in the epilogue, hence the narrower output types.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
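// For 2SM kernels such as the test above, a CTA pair cooperates on each MMA
// tile, so the per-SM epilogue tile carries half of the MMA tile's M extent
// (256x128 MMA tile -> 128x128 per-SM tile) with N unchanged. A compile-time
// sanity-check sketch using cute's standard shape accessors (adding such an
// assert to the tests is our illustration, not existing harness code):
template <class MmaTileShape, class PerSmTileShape>
constexpr bool per_sm_tile_matches_2sm_mma_tile() {
  return cute::size<0>(MmaTileShape{}) == 2 * cute::size<0>(PerSmTileShape{}) &&
         cute::size<1>(MmaTileShape{}) == cute::size<1>(PerSmTileShape{});
}
// e.g. static_assert(per_sm_tile_matches_2sm_mma_tile<
//          Shape<_256,_128,_128>, Shape<_128,_128,_128>>());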
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* Mma Tile Shapes supported:
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x128x128_1x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
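// ElementC = void builds a source-less epilogue: D = alpha * accumulator with
// no beta * C term, so GmemLayoutC/AlignC above are interface placeholders and
// the C pointer slot is simply null at runtime. A wiring sketch; the
// {thread, ptr_C, stride_C, ptr_D, stride_D} field order and the {alpha, beta}
// fusion arguments follow the usual CUTLASS 3.x collective epilogue Arguments
// and are assumptions here:
template <class Gemm, class ElementD, class StrideD>
typename Gemm::GemmKernel::CollectiveEpilogue::Arguments
make_sourceless_epilogue_args(ElementD* ptr_D, StrideD stride_D, float alpha) {
  // beta is meaningless without a C tensor; the null source is passed explicitly.
  return {{alpha, 0.f}, nullptr, {}, ptr_D, stride_D};
}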
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x192x128_1x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
  // For N=256, f32 or f16 C/D tensors consume too much SMEM in the epilogue, hence void C and an 8-bit D.
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
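// KernelScheduleAuto (used in the test above) lets the builder choose the
// mainloop schedule, while tags such as KernelTmaWarpSpecialized2SmBlockScaledSm100
// (next test) pin it explicitly. Either way the decision is recorded on the
// built collective; e.g. the pipeline depth it settled on can be inspected
// (DispatchPolicy::Stages is the conventional CUTLASS 3.x location; treating
// it as stable API is an assumption):
template <class CollectiveMainloop>
constexpr int selected_mainloop_stage_count() {
  return CollectiveMainloop::DispatchPolicy::Stages;
}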
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* Mma Tile Shapes supported:
For the A tensor (mxfp6 type, not exercised by this file's fp8 tests), the stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
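// --- Illustrative sketch (not part of the tests) ---------------------------
// How the ue8m0 scale factors relate to the fp8 data: assuming the standard
// OCP MX block size of 32, every 32 consecutive elements along K share one
// 8-bit (ue8m0) scale. The hypothetical helper below only illustrates the
// resulting scale-factor count; the actual CUTLASS scale-factor layouts are
// produced by the collective builders.
namespace mxfp8_sketch {
constexpr int kMxBlockSize = 32;  // OCP MX spec: 32-element scaling blocks
// Number of ue8m0 scale factors for an M x K MXFP8 operand.
constexpr long long scale_factor_count(long long M, long long K) {
  return M * ((K + kMxBlockSize - 1) / kMxBlockSize);
}
static_assert(scale_factor_count(128, 128) == 512, "one scale per 32 K-elements");
}  // namespace mxfp8_sketch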
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
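// Naming convention for the tests below: each operand is written as
// <scale-factor type>x<element type> plus a BLAS-style layout letter,
// 't' for row-major (transposed) and 'n' for column-major. For example,
// ue8m0xe5m2t is a row-major e5m2 tensor with ue8m0 scale factors.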
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
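// Note: TestAll (from gemm_testbed_3x.hpp) exercises a sweep of problem
// shapes and epilogue scalars, builds Gemm::Arguments (including the
// scale-factor tensors for A and B), runs the kernel through the
// GemmUniversalAdapter, and verifies the device output against a host
// reference. The remaining tests in this file differ only in element types,
// tile/cluster shapes, and kernel schedule policy.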
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, an f32 or f16 C tensor would consume too much SMEM in the epilogue, so C is void (no source tensor).
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)


@ -0,0 +1,683 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for nvfp4xnvfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue4m3
* Layout: Row Major (T)
* Alignment: 32 elements
* B tensor:
* Types: {e2m1}xue4m3
* Layout: Column Major (N)
* Alignment: 32 elements
* Mma Tile Shapes supported:
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x256 | Y | N | N | N |
| 1SM | 128x192x256 | Y | N | N | N |
| 1SM | 128x256x256 | Y | N | N | N |
| 2SM | 256x128x256 | Y | N | N | N |
| 2SM | 256x192x256 | Y | N | N | N |
| 2SM | 256x256x256 | Y | N | N | N |
(*) Unit tests in this file
*/
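// --- Illustrative sketch (not part of the tests) ---------------------------
// Why the operand alignment above is 32 elements: TMA copies require 16-byte
// (128-bit) alignment, so the minimum element alignment is 128 bits divided
// by the element width. For 4-bit e2m1 that is 32 elements; for the 8-bit
// formats in the mxfp8 tests it is 16. (NVFP4 additionally pairs each
// 16-element block with one ue4m3 scale factor.)
namespace nvfp4_sketch {
constexpr int min_tma_alignment_elems(int bits_per_element) {
  return 128 / bits_per_element;  // 128 bits == 16 bytes
}
static_assert(min_tma_alignment_elems(4) == 32, "e2m1 operands");
static_assert(min_tma_alignment_elems(8) == 16, "e4m3/e5m2 operands");
}  // namespace nvfp4_sketch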
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////
//
// Using automatic kernel schedule selection (KernelScheduleAuto) with **static** cluster shapes
//
///////////////////////////////////////////////////////////////////////////////
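// The cluster shapes in these tests are compile-time constants (cute::Shape
// of static integers), which lets the builders specialize the TMA multicast
// and CTA pairing at compile time. CUTLASS 3.8 alternatively supports
// dynamic clusters (e.g. Shape<int,int,_1>) whose size is chosen at launch;
// those are not exercised here.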
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
//
// Using large Cta Tiles: N=192 and N=256
//
//////////////////////////////////////////////////////////////////////////////
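// These tests pin targeted kernel schedules (1Sm/2Sm BlockScaled and Nvf4
// policies) rather than KernelScheduleAuto, exercising the wider N=192 and
// N=256 output tiles with both schedule families.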
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x192x256_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x256x256_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x192x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x256x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)


@ -0,0 +1,374 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for runtime (type-erased) data types in block-scaled FP4 GEMMs
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////
//
// Using Runtime Types
//
//////////////////////////////////////////////////////////////////////////////
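// With cutlass::type_erased_dynamic_nv_float4_t the kernel is compiled once
// against type-erased fp4 operands; the concrete encoding is supplied at run
// time. The testbed forwards the desired encodings as cute::UMMA::MXF4Format
// arguments (E2M1 below), so no per-datatype recompilation is needed.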
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_4x2x1_1sm_auto_runtime_dtypes) {
// Describe A and B tensors
using ElementA = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF4Format::E2M1, cute::UMMA::MXF4Format::E2M1);
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto_runtime_dtypes) {
// Describe A and B tensors
using ElementA = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF4Format::E2M1, cute::UMMA::MXF4Format::E2M1);
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
//
// Using Stream-K Scheduler
//
//////////////////////////////////////////////////////////////////////////////
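// Stream-K balances work across SMs by additionally splitting output tiles
// along the K dimension: SMs cooperatively produce partial accumulations for
// a tile, which are reduced before the epilogue runs. This avoids the idle
// tail that a purely tile-parallel schedule leaves when the tile count does
// not divide evenly across the machine.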
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_1x4x1_1sm_auto_streamK) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
TileScheduler // Specify the streamK scheduler for the kernel
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x2x1_2sm_auto_streamK) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
TileScheduler // Specify the streamK scheduler for the kernel
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,436 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit test for nvfp4 Block Scaled Gemm with nvfp4 output
D tensor:
* Types: e2m1x{ue4m3}
      * Layout: Row Major (T)
      * Alignment: 32
    * Scale factors are generated together with the fp4 output, along the contiguous dimension of the D tensor (an illustrative sketch follows this comment block).
    * Before scale-factor generation, other epilogue fusion operations may be applied:
* alpha
* beta
* activation
* bias
This UT tests
- alpha + beta + scale-factor generation
- alpha + beta + bias + scale-factor generation
*/
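// Illustrative sketch of per-block scale-factor generation (an assumption for
// exposition, not the exact CUTLASS implementation). One ElementSFD is
// produced per SFDVectorSize contiguous D elements; acc[]/d[] and the
// cast_to_* helpers are hypothetical names:
//
//   float amax = 0.f;
//   for (int i = 0; i < SFDVectorSize; ++i) { amax = fmaxf(amax, fabsf(acc[i])); }
//   ElementSFD sfd = cast_to_ue4m3(amax / 6.0f);  // 6.0f == max finite float_e2m1_t
//   for (int i = 0; i < SFDVectorSize; ++i) { d[i] = cast_to_e2m1(acc[i] / float(sfd)); }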
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 16)
// with alpha/beta fusion
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstensorop_1sm_f32, 128x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct FusionOperation
//
constexpr int SFDVectorSize = 16;
// Define the fusion operation applied during epilogue
using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
SFDVectorSize,
ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementC
>;
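  // In outline, this fusion computes D = e2m1_quantize(alpha * acc + beta * C)
  // and, as a side output, writes one float_ue4m3_t scale factor per
  // SFDVectorSize contiguous elements of D (see the sketch at the top of this
  // file for the per-block math).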
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
      FusionOperation // Block scale-factor generation fusion defined above
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstensorop_2sm_f32, 256x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
//
// Construct FusionOperation
//
constexpr int SFDVectorSize = 16;
// Define the fusion operation applied during epilogue
using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
SFDVectorSize,
ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementC
>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
      FusionOperation // Block scale-factor generation fusion defined above
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 32)
// with alpha/beta fusion
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs32_bstensorop_1sm_f32, 128x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct FusionOperation
//
constexpr int SFDVectorSize = 32;
// Define the fusion operation applied during epilogue
using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
SFDVectorSize,
ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementC
>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
      FusionOperation // Block scale-factor generation fusion defined above
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 16)
// with alpha+beta+relu+bias fusion
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1n_outputVs16_bstensorop_1sm_f32_bias_relu, 128x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Bias type
using ElementBias = float;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
  constexpr int SFDVectorSize = 16;
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasBlockScaleFactor<
SFDVectorSize, ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementBias, ElementC
>;
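  // Same block scale-factor generation as the tests above, with a per-column
  // bias vector folded into the linear combination first: in outline,
  // D = e2m1_quantize(alpha * acc + beta * C + bias).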
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
PerSmTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, AlignA,
ElementB, GmemLayoutB, AlignB,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>();
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,364 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_2sm_f32_ptr_array, 256x128x64_4x1x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
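  // For 16-bit bf16 elements this evaluates to 8 elements, i.e. one 16-byte,
  // TMA-friendly access.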
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_4,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
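  // Worked shapes for this test: TileShape 256x128x64 / ClusterShape 4x1x1
  // gives each CTA a 64x128x64 output tile, while AtomThrShape 2x1x1 makes the
  // MMA tile 128x128x64, i.e. two CTAs along M cooperate on one 2-SM MMA.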
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch (2-SM, matching the test name and AtomThrShape above)
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_4x4x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,323 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/// A Row B Col
TEST(SM100_Device_Gemm_bf16t_bf16n_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
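  // Worked shapes: 512x512x128 cluster tile / 4x4x1 cluster = 128x128x128
  // output tile per CTA; the 256x128x128 MMA tile spans two CTAs along M (2-SM).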
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100_Device_Gemm_bf16n_bf16t_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100_Device_Gemm_bf16t_bf16t_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100_Device_Gemm_bf16n_bf16n_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_bf16t_bf16t_bf32_void_f32n_tensor_op, 128x256x64_1x2x1) {
using ElementA = cutlass::bfloat16_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::bfloat16_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using MmaTileShape = Shape<_128,_128,_64>;
using TileShape_MNK = Shape<_128,_256,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
void, LayoutC, 8,
float, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 8,
      ElementB, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,364 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_1sm_f16_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
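// ----------------------------------------------------------------------------------------------
// Illustrative sketch only (not part of the checked-in tests): the argument layout for a
// ptr-array kernel like the one above, mirroring the CUTLASS 3.x ptr-array examples. The
// d_ptr_* names are hypothetical device arrays holding L per-batch pointers; a single stride
// is shared by all L batches. TestSmall<Gemm> assembles the equivalent arguments internally.
template <class Gemm>
typename Gemm::Arguments make_ptr_array_args(
int M, int N, int K, int L,
cutlass::half_t const** d_ptr_A, cutlass::half_t const** d_ptr_B,
cutlass::half_t const** d_ptr_C, cutlass::half_t** d_ptr_D,
typename Gemm::GemmKernel::StrideA stride_A, typename Gemm::GemmKernel::StrideB stride_B,
typename Gemm::GemmKernel::StrideC stride_C, typename Gemm::GemmKernel::StrideD stride_D,
cutlass::half_t alpha, cutlass::half_t beta) {
return typename Gemm::Arguments{
cutlass::gemm::GemmUniversalMode::kArray,
{{M, N, K, L}}, // ArrayProblemShape: one shape shared by L batches
{d_ptr_A, stride_A, d_ptr_B, stride_B}, // mainloop: per-batch pointer arrays
{{alpha, beta}, d_ptr_C, stride_C, d_ptr_D, stride_D} // epilogue: scalars plus C/D arrays
};
}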
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_1sm_f16_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_1sm_f16_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_2sm_f16_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
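// ----------------------------------------------------------------------------------------------
// Illustrative check only (self-contained; restates the aliases from the 2-SM test above): the
// tile-shape algebra these tests rely on. For a 2-SM MMA the atom spans two CTAs in M, so
// MmaTileShape covers the full 256-row tile while each CTA in the cluster writes a 128-row
// OutputCtaShape slab of the result.
namespace sm100_tile_algebra_example {
using TileShape = cute::Shape<cute::_256, cute::_128, cute::_64>;
using Cluster = cute::Shape<cute::_2, cute::_1, cute::_1>;
using AtomThr = decltype(cute::shape_div(Cluster{}, cute::Shape<cute::_2, cute::_1, cute::_1>{}));
using OutputCta = decltype(cute::shape_div(TileShape{}, Cluster{}));
using MmaTile = decltype(cute::shape_div(TileShape{}, AtomThr{}));
static_assert(cute::size<0>(OutputCta{}) == 128, "each CTA in the cluster owns 128 output rows");
static_assert(cute::size<0>(MmaTile{}) == 256, "the 2-SM MMA tile spans both CTAs in M");
} // namespace sm100_tile_algebra_example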
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_2sm_f16_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,606 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Grouped GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
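// ----------------------------------------------------------------------------------------------
// Illustrative sketch only (not part of the checked-in tests): for grouped GEMM the builders
// take pointer-to-layout tags (LayoutA *, LayoutC *, ...) to select per-group strides, and
// GroupProblemShape carries one Shape<int,int,int> per group. The d_*/h_* names below are
// hypothetical device/host arrays of length `groups`; this mirrors the CUTLASS 3.x grouped-GEMM
// examples, and TestSmall<Gemm> assembles the equivalent arguments internally. Note that for
// grouped kernels GemmKernel::StrideA is itself a pointer to per-group strides.
template <class Gemm>
typename Gemm::Arguments make_grouped_args(
int groups,
typename Gemm::GemmKernel::ProblemShape::UnderlyingProblemShape* d_problem_sizes,
typename Gemm::GemmKernel::ProblemShape::UnderlyingProblemShape const* h_problem_sizes,
cutlass::half_t const** d_ptr_A, typename Gemm::GemmKernel::StrideA d_stride_A,
cutlass::half_t const** d_ptr_B, typename Gemm::GemmKernel::StrideB d_stride_B,
cutlass::half_t const** d_ptr_C, typename Gemm::GemmKernel::StrideC d_stride_C,
cutlass::half_t** d_ptr_D, typename Gemm::GemmKernel::StrideD d_stride_D,
float alpha, float beta) {
return typename Gemm::Arguments{
cutlass::gemm::GemmUniversalMode::kGrouped,
{groups, d_problem_sizes, h_problem_sizes}, // per-group M/N/K extents
{d_ptr_A, d_stride_A, d_ptr_B, d_stride_B}, // per-group pointers and strides
{{alpha, beta}, d_ptr_C, d_stride_C, d_ptr_D, d_stride_D}
};
}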
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16n_f16t_f16t_tensor_op_1sm_f32_group, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16n_f16n_f16n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16t_f16t_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16t_f16n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,665 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
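// ---------------------------------------------------------------------------
// Editor's note: an illustrative compile-time sketch (not part of the original
// test file) of the shape recipe the tests above share. For 1SM kernels the
// shape_div divisor is Shape<_1,_1,_1>, so the MMA tile equals the CTA tile;
// for 2SM kernels it is Shape<_2,_1,_1>, so the MMA atom spans two CTAs along
// M while each CTA still owns TileShape / ClusterShape of the output. Worked
// out for the 256x128x64_2x1x1 configuration above:
namespace shape_recipe_sketch {
using TileShape_MNK    = cute::Shape<cute::_256, cute::_128, cute::_64>;
using ClusterShape_MNK = cute::Shape<cute::_2, cute::_1, cute::_1>;
// 2SM divisor: two CTAs cooperate on one MMA atom along M, leaving a 1x1x1 grid of atoms.
using AtomThrShape     = decltype(cute::shape_div(ClusterShape_MNK{}, cute::Shape<cute::_2, cute::_1, cute::_1>{}));
// Each CTA writes a 128x128x64 slice of the output tile ...
using OutputCtaShape   = decltype(cute::shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
// ... while the MMA instruction itself operates on the full 256x128x64 tile.
using MmaTileShape     = decltype(cute::shape_div(TileShape_MNK{}, AtomThrShape{}));
static_assert(cute::size<0>(OutputCtaShape{}) == 128, "each CTA writes a 128-row slice of the tile");
static_assert(cute::size<0>(MmaTileShape{})   == 256, "the 2SM MMA tile spans both CTAs along M");
} // namespace shape_recipe_sketch
// ---------------------------------------------------------------------------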
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
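// Editor's note: in the ptr-array testbed header included by this file, the two
// scalar arguments to TestSmall<Gemm>(...) are alpha and beta for the epilogue
// D = alpha * (A @ B) + beta * C, so the (1.0, 0.0) cases above exercise the
// no-source path while (2.0, 2.0) exercises a full linear combination.
// (Inferred from the testbed usage; see gemm_testbed_3x_ptr_array.hpp for the
// authoritative signature.)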
TEST(SM100_Device_Gemm_f16n_f16t_f16t_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16n_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f16t_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f16t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor;                    // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
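// ---------------------------------------------------------------------------
// Editor's note: a minimal host-side sketch of how one of the ptr-array Gemm
// adapters above is typically driven. Illustration only -- it assumes
// device-resident arrays of per-batch pointers (ptr_A, ptr_B, ptr_C, ptr_D),
// matching strides, scalars alpha/beta, and a populated hw_info, none of which
// appear in the original file:
//
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kArray,
//     {{M, N, K, num_batches}},                          // ArrayProblemShape
//     {ptr_A, stride_A, ptr_B, stride_B},                // mainloop operands
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D}, // epilogue
//     hw_info
//   };
//   Gemm gemm;
//   if (gemm.can_implement(args) == cutlass::Status::kSuccess) {
//     cutlass::device_memory::allocation<uint8_t> workspace(Gemm::get_workspace_size(args));
//     gemm.initialize(args, workspace.get());
//     gemm.run();
//   }
// ---------------------------------------------------------------------------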

View File

@ -0,0 +1,250 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface with stream-K scheduling
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 128x256x64_1x2x1) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_128,_256,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 256x128x64_2x1x1) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 256x256x64_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
///////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_gmma_f32_stream_k, 256x256x64_2x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
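// ---------------------------------------------------------------------------
// Editor's note: the stream-K variants above differ from their data-parallel
// counterparts only in the fourth template argument of
// cutlass::gemm::kernel::GemmUniversal, which selects the tile scheduler.
// A minimal sketch of the two choices (types as defined in the tests above):
//
//   using DataParallelKernel = cutlass::gemm::kernel::GemmUniversal<
//       Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;  // default persistent scheduler
//
//   using StreamKKernel = cutlass::gemm::kernel::GemmUniversal<
//       Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue,
//       cutlass::gemm::StreamKScheduler>;  // splits the K iterations of output tiles across SMs
//
// Stream-K rebalances work by letting multiple SMs cooperate on the K loop of
// the same output tile, which is why these tests sweep K over {64, 1024, 2048}.
// ---------------------------------------------------------------------------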

View File

@ -0,0 +1,104 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100_Device_Gemm_f16t_f16t_f32_void_f16n_tensor_op, 128x256x64_1x2x1) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_128,_256,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
void, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
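// ---------------------------------------------------------------------------
// Editor's note: passing `void` as ElementC to the epilogue CollectiveBuilder
// (the `void, LayoutC, 8` argument above) builds a source-less epilogue: no C
// tensor is loaded and the beta term is elided, which is why the test drives
// TestSmall<Gemm>(1.0, 0.0) with beta == 0. A sourceful variant would simply
// substitute a real element type (sketch, using the aliases from the test):
//
//   using EpilogueWithSource = typename cutlass::epilogue::collective::CollectiveBuilder<
//       cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
//       OutputCtaShape, ClusterShape_MNK,
//       cutlass::epilogue::collective::EpilogueTileAuto,
//       float, float,
//       cutlass::half_t, LayoutC, 8,   // ElementC: load C and apply beta
//       cutlass::half_t, LayoutC, 8,
//       cutlass::epilogue::collective::EpilogueScheduleAuto
//   >::CollectiveOp;
// ---------------------------------------------------------------------------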

View File

@ -0,0 +1,664 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
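// Editor's note: with ElementC/ElementD = float in the tests of this file, the
// AlignmentC/AlignmentD expressions above evaluate to 128 / 32 = 4 elements,
// the same 16-byte access granularity that yields 8 elements for the half_t
// operands A and B (128 / 16 = 8).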
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
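// The *2SmSm100 schedules let the two CTAs of the cluster cooperate on a
// single MMA, which is why MmaTileShape above retains the full 256-row tile
// while each CTA's output tile (OutputCtaShape) is half that.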
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16t_f16n_f32t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16n_f16t_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16n_f16t_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16n_f16n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
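
// For reference, a host-side launch of one of these ptr-array kernels looks
// roughly like the sketch below (what TestSmall drives internally; ptr_A,
// stride_A, hw_info, workspace, etc. are illustrative names, not defined here):
//
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kArray,
//     {{M, N, K, L}},                                    // one shape shared by all L entries
//     {ptr_A, stride_A, ptr_B, stride_B},                // device arrays of per-batch pointers
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D},
//     hw_info
//   };
//   Gemm gemm;
//   gemm.initialize(args, workspace);
//   gemm.run();
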
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,606 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Grouped GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;

TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
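// With float operands under OpClassTensorOp, the collective builder is
// expected to lower the mainloop to TF32 tensor-core MMAs.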
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
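// Unlike the ptr-array kernels above, grouped GEMM passes layouts as pointer
// types (LayoutC *, etc.) so that each group can carry its own strides.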
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
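// GroupProblemShape holds one rank-3 (M,N,K) problem size per group, whereas
// the ptr-array kernels use a single rank-4 (M,N,K,L) ArrayProblemShape
// shared by every batch entry.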
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32n_f32n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32n_f32t_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32t_f32n_tensor_op_1sm_f32_group, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32n_f32n_f32n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32n_f32t_f32n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32t_f32n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32n_f32n_f32n_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
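
// A grouped launch differs from the ptr-array sketch mainly in its mode and
// per-group problem sizes; roughly (illustrative names, not defined here):
//
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kGrouped,
//     {num_groups, problem_sizes_device, problem_sizes_host},
//     {ptr_A, stride_A, ptr_B, stride_B},   // per-group pointer and stride arrays
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D},
//     hw_info
//   };
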
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,667 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 1.0);
EXPECT_TRUE(result);
}
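For readers tracking the tile algebra in this first test: with a 1x1x1 cluster the shape_div lines are identities, so each CTA both computes and writes out the full 64x128x64 tile. A compile-time sketch of that claim (assumes CuTe is on the include path):

#include "cute/tensor.hpp"

namespace sketch_1sm {
using namespace cute;
using TileShape_MNK    = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape     = decltype(shape_div(TileShape_MNK{}, Shape<_1,_1,_1>{}));

static_assert(size<0>(OutputCtaShape{}) ==  64);  // M per CTA
static_assert(size<1>(OutputCtaShape{}) == 128);  // N per CTA
static_assert(size<0>(MmaTileShape{})   ==  64);  // MMA tile equals the CTA tile
} // namespace sketch_1sm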
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
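The 2SM variants divide differently: AtomThrShape is the cluster shape divided by Shape<_2,_1,_1>, reflecting that one MMA atom spans a pair of CTAs in M. As a result the MMA tile keeps the full 256-row extent while each CTA's output tile is halved. A compile-time sketch of the 256x128x64 / 2x1x1 configuration above:

#include "cute/tensor.hpp"

namespace sketch_2sm {
using namespace cute;
using TileShape_MNK    = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape     = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));  // -> 1x1x1
using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape     = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

static_assert(size<0>(OutputCtaShape{}) == 128);  // each CTA epilogues 128 rows
static_assert(size<0>(MmaTileShape{})   == 256);  // the 2-SM MMA covers all 256 rows
} // namespace sketch_2sm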
TEST(SM100_Device_Gemm_f32t_f32n_f32t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32t_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32n_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32n_f32t_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32t_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32n_f32n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,327 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 512x256x256_4x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(pass);
}
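The mainloop alignment of 32 in the builder above is the 4-bit analogue of the 128 / sizeof_bits pattern used in the f32 tests: e2m1 elements are 4 bits wide, so 32 of them fill one 128-bit (16-byte) access. A sketch of the arithmetic:

#include "cutlass/numeric_types.h"

static_assert(cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 4,
              "e2m1 is a 4-bit floating-point type");
static_assert(32 * cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 128,
              "a 32-element alignment is one 128-bit access");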
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x384x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x512x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 256x256x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
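On the MmaTypePair tuples used throughout these block-scaled tests: each pairs the 4-bit e2m1 data type with a ue8m0 scale-factor type. Assuming the OCP MX convention of one scale per 32-element block along K (stated here as background, not read from this diff), a 256-deep K tile carries eight scales per row of A and per column of B:

// Sketch: scale-factor count per K tile for mxf4, assuming a 32-element
// MX block size per ue8m0 scale.
constexpr int kScaleBlockK    = 32;   // elements sharing one scale (assumption)
constexpr int kTileK          = 256;  // K extent of the cluster tiles above
static_assert(kTileK % kScaleBlockK == 0);
constexpr int kScalesPerKTile = kTileK / kScaleBlockK;
static_assert(kScalesPerKTile == 8);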
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,327 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 512x256x256_4x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
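This file mirrors the grouped block-scaled tests above but swaps the problem-shape wrapper: GroupProblemShape<Shape<int,int,int>> carries an independent M,N,K per group, while ArrayProblemShape<Shape<int,int,int,int>> carries a single M,N,K,L shared by all L batches. Side by side (a sketch; the includes at the top of this test file are sufficient):

// Grouped GEMM: every group may have its own M, N, K (and its own strides,
// hence the LayoutA * / LayoutB * builder tags in the grouped file).
using GroupShape = cutlass::gemm::GroupProblemShape<cute::Shape<int,int,int>>;
// Ptr-array GEMM: one M, N, K shared by all L batches, plain layout tags.
using ArrayShape = cutlass::gemm::ArrayProblemShape<cute::Shape<int,int,int,int>>;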
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x384x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,156 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
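// Runtime-datatype tests: the mainloop is instantiated on
// type_erased_dynamic_float4_t, and the concrete 4-bit encoding (E2M1 for both
// A and B below) is selected at run time via cute::UMMA::MXF8F6F4Format.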
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32t_tensorop_2sm_f32_runtime_datatype, 512x512x128_4x4x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
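// StageCountAutoCarveout reserves the epilogue's SharedStorage out of the
// shared-memory budget and gives the remainder to the mainloop pipeline stages.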
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E2M1, cute::UMMA::MXF8F6F4Format::E2M1);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E2M1, cute::UMMA::MXF8F6F4Format::E2M1);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,486 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
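// Ptr-array block-scaled GEMMs on the Mxf8f6f4 schedule: e2m3 (FP6) operands
// paired with ue8m0 scale factors, using the 128-element operand alignment
// used by this schedule.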
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 128x128x256_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_2x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 512x1024x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x512x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_512,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x1024x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,156 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
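// A and B can decode to different formats at run time: both operands are
// type_erased_dynamic_float6_t here, with A resolved to E3M2 and B to E2M3.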
TEST(SM100_Device_Gemm_e3m2t_e2m3n_f32t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E3M2, cute::UMMA::MXF8F6F4Format::E2M3);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e3m2t_e2m3n_f32t_tensorop_1sm_f32_runtime_datatype, 512x512x128_4x4x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E3M2, cute::UMMA::MXF8F6F4Format::E2M3);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,109 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
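// Mixed-width runtime datatypes: A is type-erased FP8 with a 16-element
// alignment, B is type-erased FP4 with a 128-element alignment, resolved at
// run time to E4M3 and E2M1 respectively.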
TEST(SM100_Device_Gemm_e4m3t_e2m1n_f32t_tensorop_2sm_f32_runtime_datatype, 256x128x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E2M1);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,504 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Grouped GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
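// Grouped GEMM tests: GroupProblemShape carries one (M,N,K) per group, and the
// pointer-to-layout arguments (e.g. LayoutA *) let every group supply its own
// strides.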
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
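// 16 / sizeof(Element) expresses a 16-byte alignment in element units
// (16 elements for the 1-byte e4m3 types here).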
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_group, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_group, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1_silu) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule,
cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::SiLu, ElementD, ElementAccumulator>
>::CollectiveOp;
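// LinCombEltAct fuses an elementwise activation into the epilogue:
// D = SiLu(alpha * acc + beta * C).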
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(2.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1_voidC_silu) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutD = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
void, LayoutD *, 16 / sizeof(ElementD),
ElementD, LayoutD *, 16 / sizeof(ElementD),
EpilogueSchedule,
cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::SiLu, ElementD, ElementAccumulator>
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(2.0, 0.0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,465 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////// 128x128x128 //////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
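// Shape vocabulary used throughout this file (editorial summary):
//   OutputCtaShape = ClusterTileShape / ClusterShape -- per-CTA tile handed to
//                    the epilogue builder;
//   AtomThrShape   = ClusterShape / <CTAs per MMA atom> (divisor Shape<_1,_1,_1>
//                    in the 1SM tests, Shape<_2,_1,_1> in the 2SM tests) -- how
//                    many MMA atoms tile the cluster;
//   MmaTileShape   = ClusterTileShape / AtomThrShape -- output tile computed by
//                    one MMA atom, fed to the mainloop builder.
// For this first test all three collapse to 128x128x128 since the cluster is 1x1x1.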
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
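// The _2sm tests below pair the PtrArrayTmaWarpSpecialized2Sm epilogue with the
// KernelPtrArrayTmaWarpSpecialized2SmSm100 mainloop: one MMA instruction spans
// two CTAs in M, so for the next test MmaTileShape works out to 256x128x128
// while each CTA still writes a 128x128 output tile.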
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
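// Worked shape arithmetic for the 4x4 cluster case below (a sketch):
//   AtomThrShape   = (4,4,1) / (2,1,1)       = (2,4,1)  -- eight 2SM MMA atoms
//   MmaTileShape   = (512,512,128) / (2,4,1) = (256,128,128)
//   OutputCtaShape = (512,512,128) / (4,4,1) = (128,128,128)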
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_ptr_array, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,297 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
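// Editorial note: these tests exercise runtime data types. The mainloop is
// built with cutlass::type_erased_dynamic_float8_t for A and B, so one compiled
// kernel accepts any 8-bit float encoding; the concrete formats are selected at
// run time by the cute::UMMA::MXF8F6F4Format values (E4M3 / E5M2) handed to
// TestRuntimeDataTypeSmall. Both encodings are one byte wide, which is what
// keeps the type erasure compatible with the TMA and shared-memory layouts.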
TEST(SM100_Device_Gemm_e5m2t_e4m3n_e4m3t_tensorop_2sm_f32_runtime_datatype, 256x128x128_2x1x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E4M3);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e5m2t_e4m3n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E4M3);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e5m2n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E5M2);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E4M3);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e5m2t_e5m2n_e5m2t_tensorop_2sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e5m2_t, cutlass::layout::RowMajor, 16,
cutlass::float_e5m2_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e5m2_t,
float,
cutlass::float_e5m2_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E5M2);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,230 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////// Test Batch alpha and beta //////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
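// Editorial note: TestSmallFusion is invoked below with three boolean template
// flags; judging from the trailing comments on each call, the last flag
// requests per-batch beta values, expanding the scalar beta argument into one
// value per batch (e.g. beta = 1.0 becomes [1.0, 2.0]). The precise flag
// semantics live in the testbed header and are assumed here.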
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_batch_alpha_beta, 128x64x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
ElementD,
ElementCompute,
ElementC,
ElementBias
>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule,
FusionOperation
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm, false, true, true>(1.0, 1.0); // beta is [1.0, 2.0]
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_bias_relu_batch_alpha_beta, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltAct<
cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule,
FusionOperation
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm, false, false, true>(1.0, 0.5); // beta is [0.5, 1.5]
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_bias_relu_batch_alpha_beta0, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltAct<
cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule,
FusionOperation
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm, false, false, true>(1.0, -1.0); // beta is [-1.0, 0.0]
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,284 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 128x64x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
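// Editorial note: these int8 tests pass CheckEquality::EXACT because s8 x s8
// products accumulated into int32 incur no rounding, so device and reference
// outputs can be compared bit-for-bit rather than within a tolerance.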
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 128x64x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementD = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 128x64x128 Cluster4x2x1 TMEM 4x1 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 512x128x128_4x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementD = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_512,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 64x256x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_s8t_s8n_s32n_tensorop_1cta_s32_ptr_array, 64x256x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int32_t;
using ElementD = int32_t;
using ElementAccumulator = int32_t;
using ElementCompute = int32_t;
using ElementBias = int32_t;
using ClusterTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 64x256x128 Cluster2x4x1 TMEM 2x2 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementD = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_128,_1024,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,293 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
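// Editorial note on the block-scaled tests in this file: each operand is
// described as a (data, scale-factor) pair -- e.g. MmaTypePairA pairs
// float_e2m1_t values with float_ue8m0_t scale factors -- and is dispatched
// through OpClassBlockScaledTensorOp with the
// KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100 schedule. The A operand is
// 4-bit (e2m1), hence the 128-element alignment used for it versus 16 elements
// for the 8-bit B operand.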
/// A Row B Col
TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
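// A is 4-bit e2m1, so its 128-element alignment works out to 64 bytes; 8-bit B needs only 16 elements.
// StageCountAutoCarveout first reserves the epilogue's shared memory, then sizes the mainloop
// stage count from whatever remains.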
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
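// Unlike the three tests above, C is non-void here, so beta = 2.0 exercises the source-accumulation path.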
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 2.0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,281 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/// A Row B Col
TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
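// mx_float8_t bundles the e4m3 data type with a ue8m0 per-block scale factor, i.e. the OCP MXFP8 format.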
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
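// With the Auto schedules the builders pick the kernel and epilogue variants themselves; the
// 256-row MMA tile spans two CTAs, giving the 2SM MMA named in the test.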
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,293 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/// A Row B Col
TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
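// GroupProblemShape makes this a grouped GEMM: each group carries its own (M,N,K), which is
// why the builders above take pointer-to-layout (GmemLayout *) arguments.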
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -87,6 +87,7 @@ TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align1, 6
/////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
using ElementOutput = float;
@ -124,6 +125,8 @@ TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 1
EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -2974,6 +2974,7 @@ TEST(SM80_gemm_threadblock_crosswise,
}
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3006,8 +3007,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x1024_32x32x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3040,8 +3044,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x64x1024_64x32x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3074,8 +3081,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x1024x1024_32x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3108,8 +3118,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x1024x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3142,8 +3155,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_256x256x6144_128x1024x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3176,8 +3192,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_512x256x6144_256x1024x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3210,8 +3229,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3244,8 +3266,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x512_32x32x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3278,8 +3303,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x64x512_64x32x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3312,8 +3340,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x128x512_32x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3346,8 +3377,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x128x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3380,8 +3414,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_256x256x6144_128x128x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3414,8 +3451,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_512x256x6144_256x128x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3448,6 +3488,8 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
TEST(SM80_gemm_threadblock_congruous,
tensor_op_64x64x16_32x64x16_8x8x4_3stage) {

View File

@ -31,6 +31,7 @@ cutlass_test_unit_add_executable(
pipeline_tma_async.cu
pipeline_tma_async_warp_specialized.cu
pipeline_tma_async_warp_specialized_persistent.cu
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
pipeline_async.cu
sequence_barrier.cu
)

View File

@ -0,0 +1,381 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit test for the PipelineCLCFetchAsync class
*/
//
//
#define KERNEL_DBG_TRACE false
#include <cuda/atomic>
#include "../common/cutlass_unit_test.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <cute/tensor.hpp>
#include <cute/arch/cluster_sm90.hpp>
#include <cutlass/util/reference/host/gemm.h>
#include <cutlass/cluster_launch.hpp>
#include "cutlass/core_io.h"
#include "cutlass/util/print_error.hpp"
#include "cutlass/util/GPU_Clock.hpp"
#include "testbed_cluster_launch_control.h"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/arch/barrier.h"
#include "cute/arch/cluster_sm90.hpp"
#include "cutlass/arch/barrier.h"
#include "cutlass/arch/reg_reconfig.h"
#include "cutlass/gemm/kernel/sm100_tile_scheduler.hpp"
using namespace cute;
using namespace cutlass;
using namespace cutlass::gemm::kernel::detail;
//////////////////// Shared Memory /////////////////////////
template <uint32_t Stages, typename ClusterShape>
struct SharedStorage
{
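// Shared memory holds the ring of CLC responses plus the scheduling pipeline's barrier storage.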
alignas(16) typename PersistentTileSchedulerSm100<ClusterShape, Stages>::CLCResponse clc_response[Stages];
alignas(8) typename PersistentTileSchedulerSm100<ClusterShape, Stages>::PipelineStorage storage;
};
//////////////////// Kernel /////////////////////////
template <typename ClusterShape, uint32_t Stages>
__launch_bounds__(256, 1)
__global__ static
void pipeline_device(int *d_workerCount)
{
extern __shared__ char shared_memory[];
// single producer, multiple consumers
// producer: WG0
// consumer: WG1
using SharedStorage = SharedStorage<Stages, ClusterShape>;
using Scheduler = PersistentTileSchedulerSm100<ClusterShape, Stages>;
using TileSchedulingPipeline = typename Scheduler::Pipeline;
SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);
// Logistics
int warp_idx = canonical_warp_idx();
auto cluster_shape = ClusterShape{};
typename TileSchedulingPipeline::Params params;
params.transaction_bytes = 16;
constexpr int NUM_PRODUCER = 32;
constexpr int NUM_CONSUMERS_PER_CTA = 32;
params.consumer_arv_count = NUM_PRODUCER + NUM_CONSUMERS_PER_CTA * cute::size<0>(cluster_shape) * cute::size<1>(cluster_shape);
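// Arrival count: the 32-thread producer warp plus one 32-thread consumer warp in each CTA of the cluster.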
params.producer_arv_count = 1;
// Only the first CTA in the Cluster is producing.
params.producer_blockid = 0;
dim3 block_id_in_cluster = cute::block_id_in_cluster();
// mbarrier.init
TileSchedulingPipeline scheduler_pipeline(shared_storage.storage, params);
Scheduler scheduler(&shared_storage.clc_response[0], typename Scheduler::Params{}, block_id_in_cluster);
// Ensure All CTAs in Cluster have completed init before issuing commits
cute::cluster_arrive_relaxed();
cute::cluster_wait();
uint32_t is_first_block_in_cluster = block_id_in_cluster.x == 0 && block_id_in_cluster.y == 0;
int lane_predicate = cute::elect_one_sync();
uint32_t is_producer = (is_first_block_in_cluster && warp_idx == 0);
uint32_t is_consumer = (warp_idx == 4);
PipelineState<Stages> scheduler_pipe_state;
PipelineState<Stages> scheduler_pipe_state_write = cutlass::make_producer_start_state<TileSchedulingPipeline>();
typename Scheduler::WorkTileInfo work_tile_info = {
static_cast<int32_t>(blockIdx.x),
static_cast<int32_t>(blockIdx.y),
static_cast<int32_t>(blockIdx.z),
false
};
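// The first work tile is simply this CTA's launch coordinates; every later tile comes from a CLC query.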
// Persistent loop
do {
// Producer
if (is_producer) {
// Only 1 thread of the entire cluster issues the query.
scheduler_pipe_state_write = scheduler.advance_to_next_work(scheduler_pipeline, scheduler_pipe_state_write);
}
// Consumers
if (is_consumer) {
int linearCLC = work_tile_info.N_idx * gridDim.x + work_tile_info.M_idx;
// Atomically increment the worker count for the linearCLC by 1.
if (lane_predicate) {
atomicAdd(&d_workerCount[linearCLC], 1);
}
}
// Union of all consumers. Note that the producer here is its own consumer.
if (is_producer || is_consumer) {
scheduler_pipeline.consumer_wait(scheduler_pipe_state);
work_tile_info = scheduler.get_current_work(scheduler_pipe_state);
scheduler_pipeline.consumer_release(scheduler_pipe_state);
++scheduler_pipe_state;
// Add block offset since the scheduler works at cluster level.
dim3 block_id_in_cluster = cute::block_id_in_cluster();
work_tile_info.M_idx += block_id_in_cluster.x;
work_tile_info.N_idx += block_id_in_cluster.y;
work_tile_info.L_idx += block_id_in_cluster.z;
}
} while (work_tile_info.is_valid_tile);
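// The loop exits once a CLC query reports an invalid tile, i.e. the grid has run out of work.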
// End of kernel
cute::cluster_sync();
}
/////////////////////////////////////////////////////
template<uint32_t Stages_, typename ClusterShape_>
struct PipelineTest {
//
// Data members
//
static constexpr uint32_t Stages = Stages_;
static constexpr uint32_t BlockSize = 128 * 2;
using ClusterShape = ClusterShape_;
//
// Methods
//
bool check_results(int *h_workerCount, int size) {
for (int i = 0; i < size; i++) {
if (h_workerCount[i] != 1) {
std::cout << "linearCLC " << i << " has worker count " << h_workerCount[i] << "\n";
return false;
}
}
return true;
}
// Run the pipeline test kernel
cudaError_t run(bool &success, dim3 grid_dim, cudaStream_t stream = 0) {
//
// Configure and launch
//
cudaError_t result;
int smem_size = 192 * 1024; // 192 KB, to force one CTA per SM
auto cluster_shape = Shape<Int<ClusterShape::kM>, Int<ClusterShape::kN>, _1>{};
// Launch a single Cluster, with BlockSize threads per CTA
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), 1);
dim3 dimGrid = grid_dim;
dim3 dimBlock(BlockSize,1,1);
result = cudaFuncSetAttribute(
pipeline_device<
decltype(cluster_shape),
Stages>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size
);
if (result != cudaSuccess) {
std::cerr << "Error: Failed to set Shared Memory size." << std::endl;
return result;
}
int array_size = dimGrid.x * dimGrid.y;
int *d_workerCount, *h_workerCount;
/* Allocate memory. workerCount[i] counts the number of workers that claim
linearCLC i. The expectation is that workerCount[i] == 1 for all i.
*/
h_workerCount = (int*)malloc(array_size * sizeof(int));
result = cudaMalloc(&d_workerCount, array_size * sizeof(int));
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaMalloc." << result << "\n";
return result;
}
for (int i = 0; i < array_size; i++) {
h_workerCount[i] = 0; // Initialize workerCount[i] to 0 for all i.
}
result = cudaMemcpy(d_workerCount, h_workerCount, array_size * sizeof(int), cudaMemcpyHostToDevice);
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaMemcpy." << result << "\n";
return result;
}
// Extended launch API
const void* kernel = (const void*)pipeline_device<decltype(cluster_shape), Stages>;
void* kernel_params[] = {&d_workerCount};
cutlass::ClusterLauncher::launch(dimGrid, dimCluster, dimBlock, smem_size, stream, kernel, kernel_params);
result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Error: cudaDeviceSynchronize() failed" << std::endl;
return result;
}
result = cudaMemcpy(h_workerCount, d_workerCount, array_size * sizeof(int), cudaMemcpyDeviceToHost);
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaMemcpy." << result << "\n";
return result;
}
success = check_results(h_workerCount, array_size);
free(h_workerCount);
result = cudaFree(d_workerCount);
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaFree." << result << "\n";
return result;
}
return cudaSuccess;
}
};
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// Cluster1x2 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x2_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<1, 2, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster2x1 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x1_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<2, 1, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster2x2 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x2_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<2, 2, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster1x1 Stage3
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x1_Stage3) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<1, 1, 1>;
static constexpr uint32_t Stages = 3;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster1x4 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x4_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<1, 4, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster4x1 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x1_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<4, 1, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster2x4 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x4_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<2, 4, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster4x2 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x2_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<4, 2, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster4x4 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x4_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<4, 4, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,154 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Testbed file used by cluster launch control pipeline unit test
*/
//
//
#if CUDA_12_0_SM90_FEATURES_SUPPORTED
#define CUTLASS_UNIT_TEST_PIPELINE true
#else
#define CUTLASS_UNIT_TEST_PIPELINE false
#endif
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <cutlass/gemm/gemm.h>
#include "cutlass/util/command_line.h"
// Command line test options
struct Options {
//
// Data Members
//
bool help = false;
bool verification_enabled = true;
int SM_count = 116;
int clock_MHz = 1477;
dim3 grid_dim = {0,0,0};
//
// Methods
//
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("verification-enabled", verification_enabled, verification_enabled);
cmd.get_cmd_line_argument("sm-count", SM_count, SM_count);
cmd.get_cmd_line_argument("clock", clock_MHz, clock_MHz);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --verification-enabled=<bool> Enable/Disable verification\n"
<< " --sm-count=<int> Number of SMs on the chip\n"
<< " --clock=<int> Locked clock value in Mhz\n";
return out;
}
};
//
// Testbed
//
template<typename Pipeline>
class Testbed {
private:
// Commandline options
Options options;
bool run_test() {
// Run the pipeline test kernel
Pipeline pipeline;
bool success = false;
cudaError_t result = pipeline.run(success, this->options.grid_dim);
CUTE_CHECK_LAST();
return success;
}
public:
Testbed(Options const &options_) : options(options_) {
int device_id = 0;
cudaDeviceProp device_prop;
CUTE_CHECK_ERROR(cudaSetDevice(device_id));
CUTE_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
if (device_prop.major < 1) {
fprintf(stderr, "Device does not support CUDA.\n");
exit(1);
}
}
/// Run verification Gemm problem sizes
bool verification() {
#if !defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
printf(
"CUTLASS_ARCH_MMA_SM100_SUPPORTED must be set, but it is not. \n"
"This test is waived.\n"
);
return true;
#endif
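// Repeat the launch; running 10 iterations makes intermittent scheduling failures more likely to surface.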
#if 1
bool is_success = false;
for (int i = 0; i < 10; i++) {
printf("iteration = %d\n", i);
is_success = run_test();
if (not is_success) {
return is_success;
}
}
return is_success;
#else
// Run the test with single launch
return run_test();
#endif
}
};