CUTLASS 3.8 Release (#2059)
* CUTLASS 3.8 Release
* update
* Update README.md
* Revert "Update README.md"
This reverts commit b353e36fe8.
* update
* update
---------
Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
This commit is contained in:
@ -118,6 +118,7 @@ void FilterArchitecture() {
|
||||
{ "SM80*", 80, kMaxDevice},
|
||||
{ "SM89*", 89, 89},
|
||||
{ "SM90*", 90, 90},
|
||||
{ "SM100*", 100, 100},
|
||||
{ 0, 0, false }
|
||||
};
|
||||
|
||||
|
||||
@ -679,6 +679,11 @@ struct GetName<cutlass::float_e4m3_t> {
|
||||
static constexpr char name[] = "float_e4m3_t";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GetName<cutlass::float_e5m2_t> {
|
||||
static constexpr char name[] = "float_e5m2_t";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GetName<cutlass::half_t> {
|
||||
static constexpr char name[] = "half_t";
|
||||
@ -724,13 +729,20 @@ using VectorConvertTypes = ::testing::Types<
|
||||
ResultSourcePair<cutlass::bfloat16_t, uint8_t>,
|
||||
ResultSourcePair<cutlass::bfloat16_t, int8_t>,
|
||||
|
||||
ResultSourcePair<cutlass::float_e4m3_t, cutlass::int2b_t>,
|
||||
ResultSourcePair<cutlass::float_e5m2_t, cutlass::int2b_t>,
|
||||
ResultSourcePair<cutlass::half_t, cutlass::int2b_t>,
|
||||
ResultSourcePair<cutlass::bfloat16_t, cutlass::int2b_t>,
|
||||
ResultSourcePair<cutlass::float_e4m3_t, cutlass::uint2b_t>,
|
||||
ResultSourcePair<cutlass::float_e5m2_t, cutlass::uint2b_t>,
|
||||
ResultSourcePair<cutlass::half_t, cutlass::uint2b_t>,
|
||||
ResultSourcePair<cutlass::bfloat16_t, cutlass::uint2b_t>,
|
||||
|
||||
ResultSourcePair<cutlass::float_e4m3_t, cutlass::int4b_t>,
|
||||
ResultSourcePair<cutlass::float_e5m2_t, cutlass::int4b_t>,
|
||||
ResultSourcePair<cutlass::half_t, cutlass::int4b_t>,
|
||||
ResultSourcePair<cutlass::bfloat16_t, cutlass::int4b_t>,
|
||||
ResultSourcePair<cutlass::float_e4m3_t, cutlass::uint4b_t>,
|
||||
ResultSourcePair<cutlass::half_t, cutlass::uint4b_t>,
|
||||
ResultSourcePair<cutlass::bfloat16_t, cutlass::uint4b_t>,
|
||||
ResultSourcePair<float, cutlass::int4b_t>
|
||||
|
||||
@ -29,6 +29,10 @@
|
||||
add_custom_target(cutlass_test_unit_gemm_device)
|
||||
add_custom_target(test_unit_gemm_device)
|
||||
|
||||
|
||||
add_subdirectory(sm100_blockscaled_tensorop_gemm)
|
||||
|
||||
|
||||
################################################################################
|
||||
|
||||
function(cutlass_test_unit_gemm_device_add_deps NAME)
|
||||
@ -433,12 +437,12 @@ cutlass_test_unit_gemm_device_add_executable(
|
||||
gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
|
||||
gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
|
||||
|
||||
sm80_gemm_f64_f64_f64_tensor_op_f64.cu
|
||||
|
||||
# SM90 device level tests
|
||||
gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu
|
||||
gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu
|
||||
|
||||
sm80_gemm_f64_f64_f64_tensor_op_f64.cu
|
||||
|
||||
gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu
|
||||
gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu
|
||||
gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu
|
||||
@ -821,3 +825,147 @@ if (CUTLASS_NVCC_DEVICE_COMPILE)
|
||||
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_gemm_device_sm100_fp16_gemm
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_gemm_f16_f16_f32_tensor_op_f32.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_tensorop_sm100_stream_k
|
||||
|
||||
sm100_gemm_f16_f16_f16_tensor_op_f32_stream_k.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_sm100_bf16_gemm
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_gemm_bf16_bf16_f32_tensor_op_f32.cu
|
||||
)
|
||||
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_tensorop_stride_batch_alpha_beta_sm100
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_gemm_f8_f8_f8_tensor_op_s32_batch_alpha_beta.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_tensorop_runtime_datatype_sm100
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_gemm_f8_f8_f8_tensor_op_f32_runtime_datatype.cu
|
||||
sm100_gemm_f6_f6_f32_tensor_op_f32_runtime_datatype.cu
|
||||
sm100_gemm_f4_f4_f32_tensor_op_f32_runtime_datatype.cu
|
||||
sm100_gemm_f8_f4_f32_tensor_op_f32_runtime_datatype.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_16b_tensorop_sm100_ptr_array
|
||||
|
||||
# 14 (9 + 5) unit tests
|
||||
sm100_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu
|
||||
sm100_gemm_bf16_bf16_bf16_tensor_op_f32_ptr_array.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_16b_tensorop_sm100_group_gemm
|
||||
|
||||
sm100_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_16b_mixed_tensorop_sm100_ptr_array
|
||||
|
||||
# 14 (9 + 5) unit tests
|
||||
sm100_gemm_f16_f16_f32_tensor_op_f32_ptr_array.cu
|
||||
sm100_gemm_f16_f16_f16_tensor_op_f16_ptr_array.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_32b_tensorop_sm100_ptr_array
|
||||
|
||||
# 10 unit tests
|
||||
sm100_gemm_f32_f32_f32_tensor_op_f32_ptr_array.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_32b_tensorop_sm100_group_gemm
|
||||
|
||||
# 10 unit tests
|
||||
sm100_gemm_f32_f32_f32_tensor_op_f32_group_gemm.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_8b_tensorop_sm100_ptr_array
|
||||
|
||||
# 12 unit tests
|
||||
sm100_gemm_i8_i8_i8_tensor_op_s32_ptr_array.cu
|
||||
sm100_gemm_f8_f8_f8_tensor_op_f32_ptr_array.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_8b_tensorop_sm100_group_gemm
|
||||
|
||||
# 8 unit tests
|
||||
sm100_gemm_f8_f8_f8_tensor_op_f32_group_gemm.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_mxf8_training_sm100_group_gemm
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_gemm_mxf8_mxf8_mxf8_tensor_op_f32_group_gemm.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_gemm_device_mxf4xmxf8_sm100_group_gemm
|
||||
|
||||
# 8 unit tests
|
||||
sm100_gemm_mxf4_mxf8_mxf8_tensor_op_f32_group_gemm.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_blockscaled_gemm_device_fp4_tensorop_sm100_ptr_array
|
||||
|
||||
# 8 unit tests
|
||||
sm100_gemm_f4_f4_f32_tensor_op_f32_ptr_array.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_blockscaled_gemm_device_fp4_tensorop_sm100_group_gemm_1
|
||||
|
||||
# 8 unit tests
|
||||
sm100_gemm_f4_f4_f32_tensor_op_f32_group_gemm.cu
|
||||
)
|
||||
cutlass_test_unit_gemm_device_add_executable(
|
||||
cutlass_test_unit_blockscaled_gemm_device_fp6_tensorop_sm100_ptr_array
|
||||
|
||||
# 8 unit tests
|
||||
sm100_gemm_f6_f6_f32_tensor_op_f32_ptr_array.cu
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -111,6 +111,18 @@ struct ElementScalarType<Gemm, Default, std::void_t<typename Gemm::EpilogueOutpu
|
||||
using Type = typename Gemm::EpilogueOutputOp::ElementScalar;
|
||||
};
|
||||
|
||||
|
||||
template <typename Gemm, typename = void>
|
||||
struct IsF8F6F4Kernel {
|
||||
static constexpr bool value = false;
|
||||
};
|
||||
|
||||
template <typename Gemm>
|
||||
struct IsF8F6F4Kernel<Gemm, std::void_t<decltype(Gemm::GemmKernel::CollectiveMainloop::IsF8F6F4)>> {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
|
||||
|
||||
// The maximum swizzle size to use
|
||||
//
|
||||
// This class, like Splits above makes it harder to confuse
|
||||
@ -212,9 +224,26 @@ bool initialize_tensor(
|
||||
scope_max = 2;
|
||||
scope_min = 0;
|
||||
}
|
||||
|
||||
else if (bits_input <= 6) {
|
||||
scope_max = 2;
|
||||
scope_min = -2;
|
||||
}
|
||||
|
||||
else if (bits_input <= 8) {
|
||||
|
||||
if constexpr (
|
||||
cute::is_same_v<Element, cutlass::float_ue8m0_t>){
|
||||
scope_max = 4;
|
||||
scope_min = 1;
|
||||
}
|
||||
else {
|
||||
|
||||
scope_max = 1;
|
||||
scope_min = -1;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
else{
|
||||
scope_max = 4;
|
||||
@ -487,6 +516,277 @@ struct HostCollectiveMainloop {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// Block Scaled Gemm Input Operands : A , B, scalefactorA, scalefactorB
|
||||
//
|
||||
template<
|
||||
class Gemm,
|
||||
int SchedulerPipelineStageCount_,
|
||||
int AccumulatorPipelineStageCount_,
|
||||
class ElementA_,
|
||||
class ElementB_
|
||||
>
|
||||
struct HostCollectiveMainloop<cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockScaledSm100<
|
||||
SchedulerPipelineStageCount_,
|
||||
AccumulatorPipelineStageCount_>,
|
||||
Gemm, ElementA_, ElementB_> {
|
||||
// Kernel data types
|
||||
using ElementA = ElementA_;
|
||||
using StrideA = typename Gemm::GemmKernel::StrideA;
|
||||
using InternalStrideA = typename Gemm::GemmKernel::InternalStrideA;
|
||||
using ElementB = ElementB_;
|
||||
using StrideB = typename Gemm::GemmKernel::StrideB;
|
||||
using InternalStrideB = typename Gemm::GemmKernel::InternalStrideB;
|
||||
using ScheduleType = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy::Schedule;
|
||||
using LayoutTagA = cutlass::detail::StrideToLayoutTagA_t<StrideA>;
|
||||
using LayoutTagB = cutlass::detail::StrideToLayoutTagB_t<StrideB>;
|
||||
|
||||
static constexpr bool IsGroupGemm = !cute::is_same_v<StrideA, InternalStrideA>;
|
||||
|
||||
using ElementAccumulator = typename Gemm::GemmKernel::ElementAccumulator;
|
||||
using ElementScalingFactor = ElementAccumulator;
|
||||
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
|
||||
using EpilogueOutputOp = typename Gemm::EpilogueOutputOp;
|
||||
|
||||
static constexpr int SFVecSize = Gemm::GemmKernel::CollectiveMainloop::SFVecSize;
|
||||
|
||||
using ElementSF = typename Gemm::GemmKernel::ElementSF;
|
||||
using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
|
||||
using Blk_MN = typename Sm100BlkScaledConfig::Blk_MN;
|
||||
using Blk_SF = typename Sm100BlkScaledConfig::Blk_SF;
|
||||
using SfAtom = typename Sm100BlkScaledConfig::SfAtom;
|
||||
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
|
||||
using InternalLayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
|
||||
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
|
||||
using InternalLayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
|
||||
|
||||
using Arguments = typename Gemm::GemmKernel::MainloopArguments;
|
||||
|
||||
// Whether to use relative equality checks
|
||||
CheckEquality check_relative_equality = CheckEquality::EXACT;
|
||||
|
||||
std::vector<InternalStrideA> stride_a_host;
|
||||
std::vector<InternalStrideB> stride_b_host;
|
||||
cutlass::DeviceAllocation<InternalStrideA> stride_a_device;
|
||||
cutlass::DeviceAllocation<InternalStrideB> stride_b_device;
|
||||
|
||||
std::vector<InternalLayoutSFA> layout_sfa_host;
|
||||
std::vector<InternalLayoutSFB> layout_sfb_host;
|
||||
cutlass::DeviceAllocation<InternalLayoutSFA> layout_sfa_device;
|
||||
cutlass::DeviceAllocation<InternalLayoutSFB> layout_sfb_device;
|
||||
|
||||
typename LayoutTagA::Stride stride_factor_A;
|
||||
typename LayoutTagB::Stride stride_factor_B;
|
||||
|
||||
cutlass::Distribution::Kind init_A;
|
||||
cutlass::Distribution::Kind init_B;
|
||||
|
||||
std::vector<cutlass::HostTensor<ElementA, LayoutTagA>> tensors_A;
|
||||
std::vector<cutlass::HostTensor<ElementB, LayoutTagB>> tensors_B;
|
||||
std::vector<cutlass::HostTensor<ElementSF, LayoutTagA>> tensors_SFA;
|
||||
std::vector<cutlass::HostTensor<ElementSF, LayoutTagB>> tensors_SFB;
|
||||
|
||||
cutlass::DeviceAllocation<const ElementA *> device_tensors_A;
|
||||
cutlass::DeviceAllocation<const ElementB *> device_tensors_B;
|
||||
cutlass::DeviceAllocation<const ElementSF *> device_tensors_SFA;
|
||||
cutlass::DeviceAllocation<const ElementSF *> device_tensors_SFB;
|
||||
|
||||
uint64_t seed;
|
||||
static constexpr uint64_t kDefaultSeed = 4096;
|
||||
|
||||
// Note: this limitation comes from testbed / not the library
|
||||
static_assert(is_row_or_col_major<InternalStrideA>(),
|
||||
"ERROR : A Layout is neither Row / Column Major)");
|
||||
static_assert(is_row_or_col_major<InternalStrideB>(),
|
||||
"ERROR : B Layout is neither Row / Column Major)");
|
||||
|
||||
HostCollectiveMainloop(
|
||||
CheckEquality check_relative_equality_ = CheckEquality::EXACT,
|
||||
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
|
||||
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
|
||||
uint64_t seed_ = kDefaultSeed,
|
||||
typename LayoutTagA::Stride stride_factor_A_ = typename LayoutTagA::Stride(),
|
||||
typename LayoutTagB::Stride stride_factor_B_ = typename LayoutTagB::Stride()
|
||||
):
|
||||
check_relative_equality(check_relative_equality_),
|
||||
stride_factor_A(stride_factor_A_),
|
||||
stride_factor_B(stride_factor_B_),
|
||||
init_A(init_A_), init_B(init_B_), seed(seed_) { }
|
||||
|
||||
template<class ProblemShapeType>
|
||||
bool initialize(ProblemShapeType problem_shapes) {
|
||||
//
|
||||
// Allocate the GEMM workspace
|
||||
//
|
||||
tensors_A.clear();
|
||||
tensors_B.clear();
|
||||
stride_a_host.clear();
|
||||
stride_b_host.clear();
|
||||
tensors_SFA.clear();
|
||||
tensors_SFB.clear();
|
||||
layout_sfa_host.clear();
|
||||
layout_sfb_host.clear();
|
||||
|
||||
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
|
||||
L = std::max(problem_shapes.groups(), L);
|
||||
|
||||
for (int32_t i = 0; i < L; ++i) {
|
||||
auto [M, N, K, mock_L] = cute::append<4>(problem_shapes.get_host_problem_shape(i), 1);
|
||||
|
||||
stride_a_host.push_back(cutlass::make_cute_packed_stride(InternalStrideA{}, {M, K, 1}));
|
||||
stride_b_host.push_back(cutlass::make_cute_packed_stride(InternalStrideB{}, {N, K, 1}));
|
||||
|
||||
// 2.x host tensor does not natively contain a batch stride or coord, so we spoof if by folding it into the outer mode
|
||||
auto a_coord = cutlass::make_Coord(M, K);
|
||||
// Cutlass has Row/Col major refers to MxK times KxN matrix product,
|
||||
// so the HostTensorB should be treated as KxN in "coord"'s view
|
||||
auto b_coord = cutlass::make_Coord(K, N);
|
||||
|
||||
tensors_A.push_back(cutlass::HostTensor<ElementA, LayoutTagA>(a_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagA>::layout_factory(a_coord, stride_factor_A)));
|
||||
tensors_B.push_back(cutlass::HostTensor<ElementB, LayoutTagB>(b_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagB>::layout_factory(b_coord, stride_factor_B)));
|
||||
|
||||
EXPECT_TRUE(initialize_tensor(tensors_A[i].host_view(), init_A, seed + 2022 + i));
|
||||
EXPECT_TRUE(initialize_tensor(tensors_B[i].host_view(), init_B, seed + 2021 + i));
|
||||
|
||||
// It is possible to randomly initialize to all zeros, so override this with non-zeros
|
||||
// in the upper left corner of each operand.
|
||||
tensors_A[i].host_view().at({0, 0}) = ElementA(1);
|
||||
tensors_B[i].host_view().at({0, 0}) = ElementB(1);
|
||||
|
||||
tensors_A[i].sync_device();
|
||||
tensors_B[i].sync_device();
|
||||
|
||||
using namespace cute;
|
||||
|
||||
auto k_blks = cutlass::ceil_div(K, size<1>(shape(SfAtom{})));
|
||||
auto m_blks = cutlass::ceil_div(M, Blk_MN{});
|
||||
auto n_blks = cutlass::ceil_div(N, Blk_MN{});
|
||||
layout_sfa_host.push_back(Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1)));
|
||||
layout_sfb_host.push_back(Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1)));
|
||||
|
||||
// 2.x host tensor does not natively contain a batch stride or coord, so we spoof if by folding it into the outer mode
|
||||
auto sfa_coord = cutlass::make_Coord(m_blks * Blk_MN{}, k_blks * Blk_SF{});
|
||||
auto sfb_coord = cutlass::make_Coord(n_blks * Blk_MN{}, k_blks * Blk_SF{});
|
||||
|
||||
tensors_SFA.push_back(cutlass::HostTensor<ElementSF, LayoutTagA>(sfa_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagA>::layout_factory(sfa_coord, stride_factor_A)));
|
||||
tensors_SFB.push_back(cutlass::HostTensor<ElementSF, LayoutTagB>(sfb_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagB>::layout_factory(sfb_coord, stride_factor_B)));
|
||||
|
||||
EXPECT_TRUE(initialize_tensor(tensors_SFA[i].host_view(), init_A, seed + 2024 + i));
|
||||
EXPECT_TRUE(initialize_tensor(tensors_SFB[i].host_view(), init_B, seed + 2025 + i));
|
||||
|
||||
// It is possible to randomly initialize to all zeros, so override this with non-zeros
|
||||
// in the upper left corner of each operand.
|
||||
tensors_SFA[i].host_view().at({0, 0}) = ElementSF(1);
|
||||
tensors_SFB[i].host_view().at({0, 0}) = ElementSF(1);
|
||||
|
||||
tensors_SFA[i].sync_device();
|
||||
tensors_SFB[i].sync_device();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
Arguments to_args(ProblemShapeType problem_shapes) {
|
||||
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
|
||||
L = std::max(problem_shapes.groups(), L);
|
||||
|
||||
std::vector<ElementA *> ptr_A_host(L);
|
||||
std::vector<ElementB *> ptr_B_host(L);
|
||||
std::vector<ElementSF *> ptr_SFA_host(L);
|
||||
std::vector<ElementSF *> ptr_SFB_host(L);
|
||||
|
||||
for (int32_t i = 0; i < L; ++i) {
|
||||
ptr_A_host.at(i) = tensors_A[i].device_data();
|
||||
ptr_B_host.at(i) = tensors_B[i].device_data();
|
||||
ptr_SFA_host.at(i) = tensors_SFA[i].device_data();
|
||||
ptr_SFB_host.at(i) = tensors_SFB[i].device_data();
|
||||
}
|
||||
|
||||
device_tensors_A.reset(L);
|
||||
device_tensors_A.copy_from_host(ptr_A_host.data());
|
||||
|
||||
device_tensors_B.reset(L);
|
||||
device_tensors_B.copy_from_host(ptr_B_host.data());
|
||||
|
||||
device_tensors_SFA.reset(L);
|
||||
device_tensors_SFA.copy_from_host(ptr_SFA_host.data());
|
||||
|
||||
device_tensors_SFB.reset(L);
|
||||
device_tensors_SFB.copy_from_host(ptr_SFB_host.data());
|
||||
|
||||
stride_a_device.reset(problem_shapes.groups());
|
||||
stride_a_device.copy_from_host(stride_a_host.data());
|
||||
|
||||
stride_b_device.reset(problem_shapes.groups());
|
||||
stride_b_device.copy_from_host(stride_b_host.data());
|
||||
|
||||
layout_sfa_device.reset(problem_shapes.groups());
|
||||
layout_sfa_device.copy_from_host(layout_sfa_host.data());
|
||||
|
||||
layout_sfb_device.reset(problem_shapes.groups());
|
||||
layout_sfb_device.copy_from_host(layout_sfb_host.data());
|
||||
|
||||
if constexpr (IsGroupGemm) {
|
||||
return Arguments{
|
||||
device_tensors_A.get(), stride_a_device.get(),
|
||||
device_tensors_B.get(), stride_b_device.get(),
|
||||
device_tensors_SFA.get(), layout_sfa_device.get(),
|
||||
device_tensors_SFB.get(), layout_sfb_device.get()
|
||||
};
|
||||
}
|
||||
else {
|
||||
return Arguments{
|
||||
device_tensors_A.get(), stride_a_host[0],
|
||||
device_tensors_B.get(), stride_b_host[0],
|
||||
device_tensors_SFA.get(), layout_sfa_host[0],
|
||||
device_tensors_SFB.get(), layout_sfb_host[0]
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
auto to_host_args(ProblemShapeType problem_shapes, int batch) {
|
||||
using namespace cute;
|
||||
//
|
||||
// Allocate the GEMM workspace
|
||||
//
|
||||
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(batch), 1);
|
||||
auto A = make_tensor(make_iterator(tensors_A[batch].host_data()),
|
||||
make_layout(make_shape(M, K, 1), stride_a_host[batch]));
|
||||
auto SfA = make_tensor(tensors_SFA[batch].host_data(), layout_sfa_host[batch]);
|
||||
|
||||
auto B = make_tensor(make_iterator(tensors_B[batch].host_data()),
|
||||
make_layout(make_shape(N, K, 1), stride_b_host[batch]));
|
||||
auto SfB = make_tensor(tensors_SFB[batch].host_data(), layout_sfb_host[batch]);
|
||||
|
||||
return cutlass::reference::host::GettMainloopParams<ElementAccumulator,
|
||||
decltype(A),
|
||||
decltype(B),
|
||||
decltype(SfA),
|
||||
decltype(SfB)
|
||||
>
|
||||
{A, SfA, B, SfB};
|
||||
}
|
||||
|
||||
void print_tensors(std::ofstream& file, int batch) {
|
||||
file << "A =\n" << tensors_A[batch].host_view()
|
||||
<< "\nB =\n" << tensors_B[batch].host_view()
|
||||
<< "\nSFA =\n" << tensors_SFA[batch].host_view()
|
||||
<< "\nSFB =\n" << tensors_SFB[batch].host_view();
|
||||
}
|
||||
|
||||
bool compare_reference(
|
||||
ProblemShapeType problem_shapes, int batch) {
|
||||
|
||||
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_A[batch].host_view()), 0);
|
||||
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_B[batch].host_view()), 0);
|
||||
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_SFA[batch].host_view()), 0);
|
||||
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_SFB[batch].host_view()), 0);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<class Gemm>
|
||||
struct HostCollectiveDefaultEpilogue {
|
||||
// fusion types are potentially void if the fusion is not supported
|
||||
@ -803,6 +1103,24 @@ struct HostCollectiveEpilogue {
|
||||
using FusionOp = typename Gemm::EpilogueOutputOp;
|
||||
static_assert(cute::is_base_of_v<cutlass::epilogue::fusion::FusionOperation, FusionOp>);
|
||||
|
||||
|
||||
// Scale factor Generation related
|
||||
using SfStrategy = cutlass::reference::host::SfStrategy;
|
||||
static constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
|
||||
static constexpr SfStrategy SfGenStrategy = (!IsBlockScaleSupported) ? SfStrategy::None : SfStrategy::SfDGen;
|
||||
static constexpr int32_t SFD_VectorSize = IsBlockScaleSupported ? FusionOp::SFVecSize : 1;
|
||||
using ElementSFD = non_void_t<cute::remove_pointer_t<typename FusionOp::ElementBlockScaleFactor>, ElementD>;
|
||||
using Sm100BlockScaledOutputConfig = cutlass::detail::Sm100BlockScaledOutputConfig<
|
||||
SFD_VectorSize
|
||||
>;
|
||||
using Blk_MN = typename Sm100BlockScaledOutputConfig::Blk_MN;
|
||||
using Blk_SF = typename Sm100BlockScaledOutputConfig::Blk_SF;
|
||||
using OutputSFAtom = typename Sm100BlockScaledOutputConfig::SfAtom;
|
||||
std::vector<cutlass::HostTensor<ElementSFD, LayoutTagD>> tensors_SFD;
|
||||
std::vector<cutlass::HostTensor<ElementSFD, LayoutTagD>> references_SFD;
|
||||
cutlass::DeviceAllocation<ElementSFD *> device_tensors_SFD;
|
||||
|
||||
|
||||
using ElementCompute = typename FusionOp::ElementCompute;
|
||||
using ElementScalar = typename FusionOp::ElementScalar;
|
||||
using ElementBias = non_void_t<typename FusionOp::ElementBias>;
|
||||
@ -904,6 +1222,11 @@ struct HostCollectiveEpilogue {
|
||||
references_D.clear();
|
||||
stride_c_host.clear();
|
||||
stride_d_host.clear();
|
||||
|
||||
tensors_SFD.clear();
|
||||
references_SFD.clear();
|
||||
|
||||
|
||||
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
|
||||
L = std::max(problem_shapes.groups(), L);
|
||||
|
||||
@ -1034,6 +1357,26 @@ struct HostCollectiveEpilogue {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if constexpr (IsBlockScaleSupported) {
|
||||
for (int32_t i = 0; i < L; ++i) {
|
||||
auto [M, N, K, _] = cute::append<4>(problem_shapes.get_host_problem_shape(i), 1);
|
||||
// If block scaled output is supported we always have at least 1 SFD
|
||||
auto m_blks = cutlass::ceil_div(M, cute::size<0>(cute::shape(OutputSFAtom{})));
|
||||
auto n_blks = cutlass::ceil_div(N, cute::size<1>(cute::shape(OutputSFAtom{})));
|
||||
auto sfd_coord = [&] () {
|
||||
return cutlass::make_Coord(m_blks * Blk_MN{}, n_blks * Blk_SF{});
|
||||
}();
|
||||
tensors_SFD.push_back(cutlass::HostTensor<ElementSFD, LayoutTagD>(sfd_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(sfd_coord, stride_factor_D)));
|
||||
references_SFD.push_back(cutlass::HostTensor<ElementSFD, LayoutTagD>(sfd_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(sfd_coord, stride_factor_D), false));
|
||||
tensors_SFD[i].sync_device();
|
||||
}
|
||||
norm_constant.resize(scalar_coord, true);
|
||||
EXPECT_TRUE(initialize_tensor(norm_constant.host_view(), init_scale, seed + 2023));
|
||||
norm_constant.sync_device();
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1116,6 +1459,17 @@ struct HostCollectiveEpilogue {
|
||||
passed &= tmp;
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (IsBlockScaleSupported) {
|
||||
tensors_SFD[batch].sync_host();
|
||||
bool passed_sf = equality_check(references_SFD[batch].host_view(), tensors_SFD[batch].host_view());
|
||||
if(!passed_sf) {
|
||||
std::cout<<"SF is incorrect"<<std::endl;
|
||||
}
|
||||
passed &= passed_sf;
|
||||
}
|
||||
|
||||
|
||||
return passed;
|
||||
}
|
||||
|
||||
@ -1308,6 +1662,19 @@ struct HostCollectiveEpilogue {
|
||||
fusion_args.amax_aux_ptr = abs_max_Aux.device_data();
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (IsBlockScaleSupported) {
|
||||
std::vector<ElementSFD *> ptr_SFD_host(L);
|
||||
for (int32_t i = 0; i < L; ++i) {
|
||||
ptr_SFD_host.at(i) = tensors_SFD[i].device_data();
|
||||
}
|
||||
device_tensors_SFD.reset(L);
|
||||
device_tensors_SFD.copy_from_host(ptr_SFD_host.data());
|
||||
|
||||
arguments.thread.block_scale_factor_ptr = device_tensors_SFD.get();
|
||||
arguments.thread.norm_constant_ptr = norm_constant.device_data();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return arguments;
|
||||
@ -1341,6 +1708,20 @@ struct HostCollectiveEpilogue {
|
||||
cute::make_layout(cute::make_shape(M, N, cute::_1{}), cute::make_stride(cute::_1{}, cute::_0{}, M)));
|
||||
auto Vbeta = cute::make_tensor(detail::make_iterator(beta.host_data()),
|
||||
cute::make_layout(cute::make_shape(M, N, cute::_1{}), cute::make_stride(cute::_1{}, cute::_0{}, N)));
|
||||
|
||||
auto SfD = [&](){
|
||||
if constexpr (IsBlockScaleSupported) {
|
||||
auto tensor = make_tensor(detail::make_iterator(references_SFD[batch].host_data()),
|
||||
Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL));
|
||||
return tensor;
|
||||
}
|
||||
else {
|
||||
// Reference kernel has a logic to ignore scalefactor computation if we pass the tensor type same as output D tensor.
|
||||
return D;
|
||||
}
|
||||
}();
|
||||
|
||||
|
||||
cutlass::reference::host::GettEpilogueParams<
|
||||
ElementScalar,
|
||||
ElementScalar,
|
||||
@ -1353,8 +1734,11 @@ struct HostCollectiveEpilogue {
|
||||
decltype(Valpha),
|
||||
decltype(Vbeta),
|
||||
ActivationFunctor
|
||||
, decltype(SfD)
|
||||
, Int<SFD_VectorSize>
|
||||
, cutlass::plus<ElementCompute>
|
||||
, false
|
||||
, SfGenStrategy
|
||||
> epilogue_params{};
|
||||
|
||||
epilogue_params.C = C;
|
||||
@ -1397,6 +1781,12 @@ struct HostCollectiveEpilogue {
|
||||
epilogue_params.Vbeta = Vbeta;
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (IsBlockScaleSupported) {
|
||||
epilogue_params.SfD = SfD;
|
||||
epilogue_params.st = norm_constant.at(coord_0);
|
||||
}
|
||||
|
||||
return epilogue_params;
|
||||
}
|
||||
};
|
||||
@ -1812,8 +2202,24 @@ bool TestSmall(double alpha = 1.0, double beta = 1.0,
|
||||
using ElementB = typename Gemm::GemmKernel::ElementB;
|
||||
using TiledMma = typename Gemm::GemmKernel::TiledMma;
|
||||
int alignment_bits = 128;
|
||||
|
||||
static constexpr bool IsF8F6F4 = cutlass::gemm::collective::detail::is_sm100_mma_f8f6f4<TiledMma, ElementA, ElementB>();
|
||||
alignment_bits = cutlass::detail::get_input_alignment_bits<ElementA, IsF8F6F4>();
|
||||
// For fp4 and fp6 mx kernels, the min alignment_input is 128 elements, so we don't need to add alignment_input in test problem sizes.
|
||||
|
||||
int alignment_input = (alignment_bits / cute::sizeof_bits<ElementA>::value == 128) ? 0 : (alignment_bits / cute::sizeof_bits<ElementA>::value);
|
||||
|
||||
|
||||
if constexpr (apply_alignment_offset) {
|
||||
// If BlockScaled, then min alignment is SFVecSize
|
||||
static constexpr bool IsBlockScaleSupported = Gemm::EpilogueOutputOp::IsBlockScaleSupported;
|
||||
static constexpr int SFVecSize = Gemm::GemmKernel::CollectiveMainloop::SFVecSize;
|
||||
if constexpr (IsBlockScaleSupported) {
|
||||
alignment_input = cutlass::round_up(alignment_input, SFVecSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
using CtaShape_MNK = typename Gemm::GemmKernel::CollectiveMainloop::CtaShape_MNK;
|
||||
using DispatchPolicy = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy;
|
||||
CtaShape_MNK cta_shape;
|
||||
|
||||
@ -258,6 +258,12 @@ struct Testbed3xTensorBroadcast {
|
||||
cute::make_layout(cute::make_shape(M, N, 1), cute::make_stride(cute::_1{}, cute::_0{}, M)));
|
||||
auto dummy_Vbeta = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
|
||||
cute::make_layout(cute::make_shape(M, N, 1), cute::make_stride(cute::_1{}, cute::_0{}, M)));
|
||||
|
||||
auto dummy_SFD = cute::make_tensor(static_cast<ElementD*>(nullptr),
|
||||
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
|
||||
using DummySFDVectorSize = cute::Int<0>;
|
||||
|
||||
|
||||
cutlass::reference::host::GettEpilogueParams<
|
||||
ElementScalar,
|
||||
ElementScalar,
|
||||
@ -270,6 +276,8 @@ struct Testbed3xTensorBroadcast {
|
||||
decltype(dummy_Valpha),
|
||||
decltype(dummy_Vbeta),
|
||||
ActivationFunctor,
|
||||
decltype(dummy_SFD),
|
||||
DummySFDVectorSize,
|
||||
cutlass::plus<ElementCompute>,
|
||||
PerColBias> epilogue_params{
|
||||
alpha,
|
||||
|
||||
@ -0,0 +1,150 @@
|
||||
# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#
|
||||
|
||||
#
|
||||
|
||||
# Device-level unit tests for SM100 (Blackwell) block-scaled tensor-op GEMM.
#
# NOTE(review): this guard skips the build only when CUTLASS_NVCC_ARCHS is
# exactly "100" — presumably because these block-scaled kernels require the
# arch-specific "100a" feature set rather than the family-agnostic sm_100
# target. Confirm that an exclusion check (rather than an inclusion match on
# 100a) is the intended behavior.
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")

# Umbrella target that builds every block-scaled SM100 GEMM test executable.
add_custom_target(
  cutlass_test_unit_gemm_device_sm100_blockscaled
  DEPENDS
  cutlass_test_unit_gemm_device_bstensorop_sm100_nvf4xnvf4
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
)

# nvfp4 x nvfp4
cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_nvf4xnvf4

  BATCH_SOURCES ON
  BATCH_SIZE 1

  nvf4_nvf4_bf16_bf16.cu
  nvf4_nvf4_bf16_bf16_features.cu
  nvf4_nvf4_f16_nvfp4_epilogue.cu
)

# mxfp4 x mxfp4
cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf4_mxf4_void_f16_tn_layout.cu
  mxf4_mxf4_void_f16_nt_layout.cu
)

# mxfp6 x mxfp6
cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf6_mxf6_void_bf16_tn_layout.cu
  mxf6_mxf6_void_bf16_nt_layout.cu
)

# mxfp8 x mxfp8
cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf8_mxf8_void_f8_tn_layout.cu
  mxf8_mxf8_void_f8_nt_layout.cu
)

# Mixed-precision pairings (A-type x B-type).
cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf6_mxf8_void_f32_tn_layout.cu
  mxf6_mxf8_void_f32_nt_layout.cu
)

cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf8_mxf6_f16_f8_tn_layout.cu
  mxf8_mxf6_f16_f8_nt_layout.cu
)

cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf4_mxf8_bf16_bf16_tn_layout.cu
  mxf4_mxf8_bf16_bf16_nt_layout.cu
)

cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf8_mxf4_f16_bf16_tn_layout.cu
  mxf8_mxf4_f16_bf16_nt_layout.cu
)

cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf6_mxf4_f16_f16_tn_layout.cu
  mxf6_mxf4_f16_f16_nt_layout.cu
)

cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6

  BATCH_SOURCES ON
  BATCH_SIZE 1

  mxf4_mxf6_f32_f16_tn_layout.cu
  mxf4_mxf6_f32_f16_nt_layout.cu
)

endif()
|
||||
@ -0,0 +1,303 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp4xmxfp4 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported depends on the layout for mxfp4 and mxfp4 mixed precision GEMM
|
||||
The tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|
||||
|--------|---------------|----|----|-------|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128x128x256_1x1x1_1sm_auto) {
  // Block-scaled mxfp4 x mxfp4 GEMM, NT layout, 1SM MMA, 1x1x1 cluster.
  // Accumulation and epilogue arithmetic are both fp32.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // A operand: mxfp4 (e2m1 data + ue8m0 scale factors), column-major, 32-element aligned.
  using ElementA    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 32;
  // B operand: mxfp4 (e2m1 data + ue8m0 scale factors), row-major, 32-element aligned.
  using ElementB    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 32;

  // No source tensor C (void); D is written as fp16, row-major.
  using ElementC    = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD    = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA-instruction tile, multicast cluster, and the per-SM output tile the
  // epilogue consumes (same as the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_128,_256>;
  using ClusterShape_MNK   = Shape<_1,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // builder picks the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve smem for the epilogue, then derive the stage count automatically.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // underlying selection is KernelTmaWarpSpecialized1SmMxf4Sm100
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // (M, N, K, L) runtime problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard testbed sweep and verify against the host reference.
  auto pass = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128x256x256_2x2x1_1sm_auto) {
  // Block-scaled mxfp4 x mxfp4 GEMM, NT layout, 1SM MMA, 2x2x1 cluster,
  // explicit block-scaled kernel schedule. fp32 accumulation and epilogue math.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // A operand: mxfp4 (e2m1 data + ue8m0 scale factors), column-major, 128-element aligned.
  using ElementA    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp4 (e2m1 data + ue8m0 scale factors), row-major, 128-element aligned.
  using ElementB    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // No source tensor C (void); D is written as fp16, row-major.
  using ElementC    = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD    = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA-instruction tile, multicast cluster, and per-SM epilogue output tile
  // (identical to the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_256,_256>;
  using ClusterShape_MNK   = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_256>;

  // Epilogue first so its smem usage can be subtracted from the mainloop stages.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // builder picks the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve smem for the epilogue, then derive the stage count automatically.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // explicit 1SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // (M, N, K, L) runtime problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard testbed sweep and verify against the host reference.
  auto pass = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256x256x256_4x4x1_2sm_auto) {
  // Block-scaled mxfp4 x mxfp4 GEMM, NT layout, 2SM MMA, 4x4x1 cluster,
  // explicit mxf8f6f4 kernel schedule. fp32 accumulation and epilogue math.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // A operand: mxfp4 (e2m1 data + ue8m0 scale factors), column-major, 128-element aligned.
  using ElementA    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp4 (e2m1 data + ue8m0 scale factors), row-major, 128-element aligned.
  using ElementB    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // No source tensor C (void); D is written as fp16, row-major.
  using ElementC    = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD    = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // 2SM MMA tile (256 in M) with the per-SM epilogue tile covering half of it.
  using MmaTileShape_MNK   = Shape<_256,_256,_256>;
  using ClusterShape_MNK   = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_256>;

  // Epilogue first so its smem usage can be subtracted from the mainloop stages.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // builder picks the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve smem for the epilogue, then derive the stage count automatically.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100  // explicit 2SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // (M, N, K, L) runtime problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard testbed sweep and verify against the host reference.
  auto pass = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,523 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp4xmxfp4 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported depends on the layout for mxfp4 and mxfp4 mixed precision GEMM
|
||||
The tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|
||||
|--------|---------------|-------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x128x256_4x4x1_1sm_auto) {
  // Block-scaled mxfp4 x mxfp4 GEMM, TN layout, 1SM MMA, 4x4x1 cluster.
  // Accumulation and epilogue arithmetic are both fp32.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // A operand: mxfp4 (e2m1 data + ue8m0 scale factors), row-major, 32-element aligned.
  using ElementA    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  // B operand: mxfp4 (e2m1 data + ue8m0 scale factors), column-major, 32-element aligned.
  using ElementB    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // No source tensor C (void); D is written as fp16, row-major.
  using ElementC    = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD    = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA-instruction tile, multicast cluster, and per-SM epilogue output tile
  // (identical to the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_128,_256>;
  using ClusterShape_MNK   = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Epilogue first so its smem usage can be subtracted from the mainloop stages.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // builder picks the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve smem for the epilogue, then derive the stage count automatically.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // underlying selection is KernelTmaWarpSpecialized1SmMxf4Sm100
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // (M, N, K, L) runtime problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard testbed sweep and verify against the host reference.
  auto pass = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x192x256_1x1x1_1sm_auto) {
  // Block-scaled mxfp4 x mxfp4 GEMM, TN layout, 1SM MMA with a 192-wide N tile,
  // 1x1x1 cluster, explicit block-scaled kernel schedule. fp32 accumulation.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // A operand: mxfp4 (e2m1 data + ue8m0 scale factors), row-major, 128-element aligned.
  using ElementA    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp4 (e2m1 data + ue8m0 scale factors), column-major, 128-element aligned.
  using ElementB    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // No source tensor C (void); D is written as fp16, row-major.
  using ElementC    = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD    = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA-instruction tile, multicast cluster, and per-SM epilogue output tile
  // (identical to the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_192,_256>;
  using ClusterShape_MNK   = Shape<_1,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_256>;

  // Epilogue first so its smem usage can be subtracted from the mainloop stages.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // builder picks the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve smem for the epilogue, then derive the stage count automatically.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // explicit 1SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // (M, N, K, L) runtime problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard testbed sweep and verify against the host reference.
  auto pass = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled GEMM: A and B are mxfp4 (e2m1 values + ue8m0 scale factors), TN layout,
// no C source (void), fp16 D. 128x256x256 MMA tile, 2x2x1 cluster, explicit 1SM schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x256x256_2x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 128;   // stride-1 mode of mxfp4 operands must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = void;        // void C: epilogue reads no source tensor
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_256,_256>;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100           // Explicit 1SM kernel schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // Runtime problem shape (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// mxfp4 x mxfp4 block-scaled GEMM, TN layout, void C / fp16 D.
// 256x128x256 MMA tile on a 2x4x1 cluster using the explicit 2SM block-scaled schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto) {
  // Operand A: mxfp4 (e2m1 data scaled by ue8m0 factors), row major.
  using ElementA    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;   // 128-element alignment on the stride-1 mode

  // Operand B: mxfp4, column major.
  using ElementB    = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // Epilogue tensors: no C source (void), half-precision D.
  using ElementC    = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;

  using ElementD    = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precisions.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // MMA instruction tile, multicast cluster, and the per-CTA output tile
  // (2SM MMA: each CTA owns half of the 256-row MMA tile).
  using MmaTileShape_MNK   = Shape<_256,_128,_256>;
  using ClusterShape_MNK   = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first: its SMEM footprint is carved out of the mainloop's budget.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // arch + tensor-op class
      PerSmTileShape_MNK, ClusterShape_MNK,                             // per-CTA output tile, cluster
      cutlass::epilogue::collective::EpilogueTileAuto,                  // let the builder pick the subtile
      ElementAccumulator, ElementCompute,                               // accumulator / compute precision
      ElementC, GmemLayoutC, AlignC,                                    // C description
      ElementD, GmemLayoutD, AlignD,                                    // D description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // auto epilogue schedule
    >::CollectiveOp;

  // Mainloop with the remaining SMEM and the explicit 2SM block-scaled kernel schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and the device-level adapter, then run the standard test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
// Block-scaled GEMM: mxfp4 A and B, TN layout, void C / fp16 D.
// 256x192x256 MMA tile, 2x1x1 cluster, schedule left to KernelScheduleAuto.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x192x256_2x1x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  // NOTE(review): 32-element alignment here while the sibling mxfp4 tests use 128 —
  // presumably valid for this schedule/tile; confirm against the builder's requirements.
  constexpr int AlignA = 32;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 32;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = void;        // void C: epilogue reads no source tensor
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_192,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_192,_256>;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      // Kernel schedule policy: Auto lets the builder select. NOTE(review): the original
      // comment claimed the underlying selection is KernelTmaWarpSpecialized1SmMxf4Sm100,
      // but this is a 256-row tile in a "2sm" test — a 2SM schedule seems expected; confirm.
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // Runtime problem shape (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled GEMM: mxfp4 A and B, TN layout, void C / fp16 D.
// 256x256x256 MMA tile, 4x2x1 cluster, explicit 2SM kernel schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x256x256_4x2x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 128;   // stride-1 mode of mxfp4 operands must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = void;        // void C: epilogue reads no source tensor
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_256,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (2SM MMA: each CTA handles half of the 256-row MMA tile)
  using PerSmTileShape_MNK = Shape<_128,_256,_256>;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100           // Explicit 2SM kernel schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // Runtime problem shape (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,304 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp4xmxfp6 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported depends on the layout for mxfp4 and mxfp6 mixed precision GEMM
|
||||
The tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|
||||
|--------|---------------|----|----|-------|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// Mixed-precision block-scaled GEMM: mxfp4 A (column major) x mxfp6 B (row major), NT layout,
// fp32 C / fp16 D. 128x128x128 MMA tile, 4x4x1 cluster, explicit 1SM block-scaled schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 128;   // stride-1 mode must be 128-element aligned for mxfp4/mxfp6
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  using ElementC = float;
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100        // Explicit 1SM block-scaled schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // Runtime problem shape (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Mixed-precision block-scaled GEMM: mxfp4 A (column major) x mxfp6 B (row major), NT layout,
// fp16 C / fp16 D. 128x256x128 MMA tile, 4x2x1 cluster, explicit 1SM schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 128;   // stride-1 mode must be 128-element aligned for mxfp4/mxfp6
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  // For N=256 using f32 and f16 consumes too much SMEM space for Epilogue.
  // Fix: qualify as cutlass::half_t (was bare `half_t`, which resolved only through
  // `using namespace cute;`) to match every other test in this file.
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100           // Explicit 1SM kernel schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // Runtime problem shape (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Mixed-precision block-scaled GEMM: mxfp4 A (column major) x mxfp6 B (row major), NT layout,
// fp16 C / fp16 D. 256x256x128 MMA tile, 2x4x1 cluster, schedule left to KernelScheduleAuto.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 128;   // stride-1 mode must be 128-element aligned for mxfp4/mxfp6
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (2SM MMA: each CTA handles half of the 256-row MMA tile)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // Runtime problem shape (M, N, K, batch)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp4xmxfp6 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported depends on the layout for mxfp4 and mxfp6 mixed precision GEMM
|
||||
The tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|
||||
|--------|---------------|-------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
|
||||
// Describe A and B tensors
|
||||
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
|
||||
constexpr int AlignA = 128;
|
||||
using GmemLayoutA = cutlass::layout::RowMajor;
|
||||
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
|
||||
constexpr int AlignB = 128;
|
||||
using GmemLayoutB = cutlass::layout::ColumnMajor;
|
||||
|
||||
// Describe C and D tensors
|
||||
using ElementC = float;
|
||||
constexpr int AlignC = 4;
|
||||
using GmemLayoutC = cutlass::layout::RowMajor;
|
||||
using ElementD = cutlass::half_t;
|
||||
constexpr int AlignD = 8;
|
||||
using GmemLayoutD = cutlass::layout::RowMajor;
|
||||
|
||||
// Mma's accumulator type
|
||||
using ElementAccumulator = float;
|
||||
// Epilogue computation's precision type
|
||||
using ElementCompute = float;
|
||||
|
||||
// Tile and cluster shapes
|
||||
// Collective MMA takes tile shape of the MMA operation as input
|
||||
using MmaTileShape_MNK = Shape<_128,_128,_128>;
|
||||
// Cluster size for multicast
|
||||
using ClusterShape_MNK = Shape<_4,_4,_1>;
|
||||
// Collective Epilogue takes the output tile shape for 1 CTA
|
||||
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
|
||||
|
||||
//
|
||||
// Construct CollectiveEpilogue
|
||||
//
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
|
||||
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
|
||||
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
|
||||
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
|
||||
ElementC, GmemLayoutC, AlignC, // C tensor description
|
||||
ElementD, GmemLayoutD, AlignD, // D tensor description
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
|
||||
>::CollectiveOp;
|
||||
|
||||
//
|
||||
// Construct CollectiveMainloop
|
||||
//
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
|
||||
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
|
||||
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
|
||||
ElementAccumulator, // Mma instruction accumulator type
|
||||
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
|
||||
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
|
||||
>::CollectiveOp;
|
||||
|
||||
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
// Run tests
|
||||
auto pass = test::gemm::device::TestAll<Gemm>();
|
||||
// Check results
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x192x128_2x1x1_1sm_auto) {
  // A operand: block-scaled mxfp4 (e2m1 data, ue8m0 scale factors), row major ("t").
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;   // mxfp4 stride-1 mode must be 128-element aligned

  // B operand: block-scaled mxfp6 (e3m2 data, ue8m0 scale factors), column major ("n").
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;   // mxfp6 stride-1 mode must be 128-element aligned

  // C (epilogue source) and D (output) tensors.
  using ElementC = float;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // MMA tile, multicast cluster, and the per-SM output tile handed to the epilogue
  // (identical to the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_192,_128>;
  using ClusterShape_MNK   = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the collective epilogue first so its SMEM footprint can be carved out
  // of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, AlignC,
      ElementD, LayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Collective mainloop with an explicit 1SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, LayoutA, AlignA,
      ElementB, LayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM usage from the capacity available for mainloop stages.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel, wrap it in the device-level adapter, and run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;   // mxfp4: e2m1 data with ue8m0 scale factors
  constexpr int AlignA = 128;                                     // stride-1 mode must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;   // mxfp6: e2m3 data with ue8m0 scale factors
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  // For N=256 using f32 C consumes too much SMEM space for the epilogue, so use f16 for both C and D.
  // Fully qualified as cutlass::half_t (was bare half_t, which resolved only through
  // `using namespace cute`) for consistency with every other test in this file.
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA (same as the MMA tile for a 1SM kernel)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue (built first so its SMEM footprint can be carved out below)
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,         // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                                    // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                         // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                      // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                           // C tensor description
      ElementD, GmemLayoutD, AlignD,                                           // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                      // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,         // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                           // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                           // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                      // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                      // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                            // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                                  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x128x128_2x4x1_2sm_auto) {
  // A operand: block-scaled mxfp4 (e2m1 data, ue8m0 scale factors), row major ("t").
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;   // mxfp4 stride-1 mode must be 128-element aligned

  // B operand: block-scaled mxfp6 (e3m2 data, ue8m0 scale factors), column major ("n").
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;   // mxfp6 stride-1 mode must be 128-element aligned

  // C (epilogue source) and D (output) tensors.
  using ElementC = float;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // 2SM MMA tile (M=256) with its multicast cluster; the epilogue sees the
  // per-SM output tile, whose M (128) is half of the MMA tile's M.
  using MmaTileShape_MNK   = Shape<_256,_128,_128>;
  using ClusterShape_MNK   = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the collective epilogue first so its SMEM footprint can be carved out
  // of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, AlignC,
      ElementD, LayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Collective mainloop with an explicit 2SM mxf8f6f4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, LayoutA, AlignA,
      ElementB, LayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM usage from the capacity available for mainloop stages.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel, wrap it in the device-level adapter, and run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // A operand: block-scaled mxfp4 (e2m1 data, ue8m0 scale factors), row major ("t").
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;   // mxfp4 stride-1 mode must be 128-element aligned

  // B operand: block-scaled mxfp6 (e3m2 data, ue8m0 scale factors), column major ("n").
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;   // mxfp6 stride-1 mode must be 128-element aligned

  // C (epilogue source) and D (output) tensors.
  using ElementC = float;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // 2SM MMA tile (M=256) with its multicast cluster; the epilogue sees the
  // per-SM output tile, whose M (128) is half of the MMA tile's M.
  using MmaTileShape_MNK   = Shape<_256,_192,_128>;
  using ClusterShape_MNK   = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the collective epilogue first so its SMEM footprint can be carved out
  // of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, AlignC,
      ElementD, LayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Collective mainloop with an explicit 2SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, LayoutA, AlignA,
      ElementB, LayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM usage from the capacity available for mainloop stages.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel, wrap it in the device-level adapter, and run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
  // A operand: block-scaled mxfp4 (e2m1 data, ue8m0 scale factors), row major ("t").
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;   // mxfp4 stride-1 mode must be 128-element aligned

  // B operand: block-scaled mxfp6 (e2m3 data, ue8m0 scale factors), column major ("n").
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;   // mxfp6 stride-1 mode must be 128-element aligned

  // C (epilogue source) and D (output) tensors, both f16.
  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // 2SM MMA tile (M=256) with its multicast cluster; the epilogue sees the
  // per-SM output tile, whose M (128) is half of the MMA tile's M.
  using MmaTileShape_MNK   = Shape<_256,_256,_128>;
  using ClusterShape_MNK   = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the collective epilogue first so its SMEM footprint can be carved out
  // of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, AlignC,
      ElementD, LayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Collective mainloop; the builder selects the kernel schedule automatically.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, LayoutA, AlignA,
      ElementB, LayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM usage from the capacity available for mainloop stages.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel, wrap it in the device-level adapter, and run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
    \brief Unit tests for mxfp4 x mxfp8 Block Scaled Gemm

    * A tensor:
        * Types: {e2m1}xue8m0
        * Layout: Column Major (N)
        * Alignment: 128 elements
    * B tensor:
        * Types: {e5m2,e4m3}xue8m0
        * Layout: Row Major (T)
        * Alignment: 16 elements
    * Mma Tile Shapes supported:
        For the A tensor (mxfp4 type) the tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
        Support Matrix (Y: Yes, N: No)
        | 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
        |--------|---------------|----|----|--------|----|
        | 1SM    | 128x128x128   | Y  | Y  | Y      | Y  |
        | 1SM    | 128x192x128   | Y  | Y  | Y      | Y  |
        | 1SM    | 128x256x128   | Y  | Y  | Y      | Y  |
        | 2SM    | 256x128x128   | Y  | Y  | Y      | Y  |
        | 2SM    | 256x192x128   | Y  | Y  | Y      | Y  |
        | 2SM    | 256x256x128   | Y  | Y  | Y      | Y  |

    (*) Unit tests in this file
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // A operand: block-scaled mxfp4 (e2m1 data, ue8m0 scale factors), column major ("n").
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutA  = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;   // mxfp4 stride-1 mode must be 128-element aligned

  // B operand: block-scaled mxfp8 (e5m2 data, ue8m0 scale factors), row major ("t").
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;    // mxfp8 only needs 16-element alignment

  // C (epilogue source) and D (output) tensors, both bf16.
  using ElementC = cutlass::bfloat16_t;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // MMA tile, multicast cluster, and the per-SM output tile handed to the epilogue
  // (identical to the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_128,_128>;
  using ClusterShape_MNK   = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the collective epilogue first so its SMEM footprint can be carved out
  // of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, AlignC,
      ElementD, LayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Collective mainloop with an explicit 1SM mxf8f6f4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, LayoutA, AlignA,
      ElementB, LayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM usage from the capacity available for mainloop stages.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel, wrap it in the device-level adapter, and run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 128x192x128_1x2x1_1sm_auto) {
  // A operand: block-scaled mxfp4 (e2m1 data, ue8m0 scale factors), column major ("n").
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutA  = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;   // mxfp4 stride-1 mode must be 128-element aligned

  // B operand: block-scaled mxfp8 (e4m3 data, ue8m0 scale factors), row major ("t").
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;    // mxfp8 only needs 16-element alignment

  // C (epilogue source) and D (output) tensors, both bf16.
  using ElementC = cutlass::bfloat16_t;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute     = float;

  // MMA tile, multicast cluster, and the per-SM output tile handed to the epilogue
  // (identical to the MMA tile for a 1SM kernel).
  using MmaTileShape_MNK   = Shape<_128,_192,_128>;
  using ClusterShape_MNK   = Shape<_1,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the collective epilogue first so its SMEM footprint can be carved out
  // of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, AlignC,
      ElementD, LayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Collective mainloop; the builder selects the kernel schedule automatically.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, LayoutA, AlignA,
      ElementB, LayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM usage from the capacity available for mainloop stages.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel, wrap it in the device-level adapter, and run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e5m2 data + ue8m0 block scale), row-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16 keeps the epilogue SMEM footprint small enough for N=256
  // (f32/f16 outputs would consume too much SMEM for this tile).
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_128,_256,_128>;    // Tile computed by one MMA instruction
  using ClusterShape_MNK = Shape<_4,_2,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;  // Output tile owned by a single SM

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // Explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e4m3 data + ue8m0 block scale), row-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_256,_128,_128>;    // 2-SM MMA tile
  using ClusterShape_MNK = Shape<_4,_4,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;  // Each SM owns half of the MMA tile's M

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // Builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e4m3 data + ue8m0 block scale), row-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_256,_192,_128>;    // 2-SM MMA tile
  using ClusterShape_MNK = Shape<_2,_2,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;  // Each SM owns half of the MMA tile's M

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100  // Explicit 2-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e5m2 data + ue8m0 block scale), row-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_256,_256,_128>;    // 2-SM MMA tile
  using ClusterShape_MNK = Shape<_4,_2,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;  // Each SM owns half of the MMA tile's M

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // Explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp4xmxfp8 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 16 elements
|
||||
* Mma Tile Shapes supported:
|
||||
For the A tensor (mxfp4 type) the tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|
||||
|--------|---------------|--------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x192x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e5m2 data + ue8m0 block scale), column-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_128,_128,_128>;    // Tile computed by one MMA instruction
  using ClusterShape_MNK = Shape<_4,_4,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;  // Output tile owned by a single SM

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // Builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e4m3 data + ue8m0 block scale), column-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_128,_192,_128>;    // Tile computed by one MMA instruction
  using ClusterShape_MNK = Shape<_2,_2,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;  // Output tile owned by a single SM

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // Explicit 1-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e5m2 data + ue8m0 block scale), column-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16 keeps the epilogue SMEM footprint small enough for N=256
  // (f32/f16 outputs would consume too much SMEM for this tile).
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_128,_256,_128>;    // Tile computed by one MMA instruction
  using ClusterShape_MNK = Shape<_4,_2,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;  // Output tile owned by a single SM

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // Explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 256x128x128_2x1x1_2sm_auto) {
  // A operand: mxfp4 (e2m1 data + ue8m0 block scale), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e4m3 data + ue8m0 block scale), column-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C/D operands: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  using ElementAccumulator = float;  // MMA accumulator precision
  using ElementCompute = float;      // Epilogue compute precision

  using MmaTileShape_MNK = Shape<_256,_128,_128>;    // 2-SM MMA tile
  using ClusterShape_MNK = Shape<_2,_1,_1>;          // CTA cluster used for multicast
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;  // Each SM owns half of the MMA tile's M

  // Build the epilogue first: its SMEM usage is carved out of the mainloop stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,  // Let the builder pick a subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Subtract the epilogue's SMEM from the stage-count budget.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // Builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble kernel and device-level adapter from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // Runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the standard verification sweep and check the outcome.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // Operand A: mxfp4 (e2m1 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp8 (e4m3 data with ue8m0 scale factors), column-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // Source (C) and output (D) tensors: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (2SM instruction), cluster shape, and the per-SM output tile
  // handed to the epilogue (half of the MMA tile's M extent per SM).
  using MmaTileShape_MNK   = Shape<_256,_192,_128>;
  using ClusterShape_MNK   = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an explicit 2SM block-scaled schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
  // Operand A: mxfp4 (e2m1 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp8 (e5m2 data with ue8m0 scale factors), column-major, 16-element aligned.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // Source (C) and output (D) tensors: bf16, row-major, 8-element aligned.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (2SM instruction), cluster shape, and the per-SM output tile
  // handed to the epilogue (half of the MMA tile's M extent per SM).
  using MmaTileShape_MNK   = Shape<_256,_256,_128>;
  using ClusterShape_MNK   = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an explicit 2SM mxf8f6f4 schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,304 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
    \brief Unit tests for mxfp6xmxfp4 Block Scaled Gemm

    * A tensor:
      * Types: {e2m3,e3m2}xue8m0
      * Layout: Column Major (N)
      * Alignment: 128 elements
    * B tensor:
      * Types: {e2m1}xue8m0
      * Layout: Row Major (T)
      * Alignment: 128 elements
    * Mma Tile Shapes supported depend on the layout for mxfp4 and mxfp6 mixed-precision GEMM:
      the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.

    Support Matrix (Y: Yes, N: No)
    | 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
    |--------|---------------|----|----|-------|----|
    | 1SM    | 128x128x128   | Y  | Y  | Y     | Y  |
    | 1SM    | 128x192x128   | Y  | N  | N     | Y  |
    | 1SM    | 128x256x128   | Y  | Y  | Y     | Y  |
    | 2SM    | 256x128x128   | Y  | N  | N     | Y  |
    | 2SM    | 256x192x128   | Y  | N  | N     | Y  |
    | 2SM    | 256x256x128   | Y  | Y  | Y     | Y  |

    (*) Unit tests in this file
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Operand A: mxfp6 (e2m3 data with ue8m0 scale factors), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp4 (e2m1 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // Source (C) and output (D) tensors: fp16, row-major, 8-element aligned.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (1SM instruction) and cluster shape; with a 1SM instruction
  // the per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK   = Shape<_128,_128,_128>;
  using ClusterShape_MNK   = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an explicit 1SM mxf8f6f4 schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Operand A: mxfp6 (e3m2 data with ue8m0 scale factors), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp4 (e2m1 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // Source (C) and output (D) tensors: fp16, row-major, 8-element aligned.
  // With N=256, fp32 C/D would consume too much SMEM in the epilogue, hence fp16.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (1SM instruction) and cluster shape; with a 1SM instruction
  // the per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK   = Shape<_128,_256,_128>;
  using ClusterShape_MNK   = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an explicit 1SM block-scaled schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
  // Operand A: mxfp6 (e3m2 data with ue8m0 scale factors), column-major, 128-element aligned.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp4 (e2m1 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // Source (C) and output (D) tensors: fp16, row-major, 8-element aligned.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (2SM instruction), cluster shape, and the per-SM output tile
  // handed to the epilogue (half of the MMA tile's M extent per SM).
  using MmaTileShape_MNK   = Shape<_256,_256,_128>;
  using ClusterShape_MNK   = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an automatically chosen kernel schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
    \brief Unit tests for mxfp6xmxfp4 Block Scaled Gemm

    * A tensor:
      * Types: {e2m3,e3m2}xue8m0
      * Layout: Row Major (T)
      * Alignment: 128 elements
    * B tensor:
      * Types: {e2m1}xue8m0
      * Layout: Column Major (N)
      * Alignment: 128 elements
    * Mma Tile Shapes supported depend on the layout for mxfp4 and mxfp6 mixed-precision GEMM:
      the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.

    Support Matrix (Y: Yes, N: No)
    | 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
    |--------|---------------|-------|----|----|----|
    | 1SM    | 128x128x128   | Y     | Y  | Y  | Y  |
    | 1SM    | 128x192x128   | Y     | N  | N  | Y  |
    | 1SM    | 128x256x128   | Y     | Y  | Y  | Y  |
    | 2SM    | 256x128x128   | Y     | N  | N  | Y  |
    | 2SM    | 256x192x128   | Y     | N  | N  | Y  |
    | 2SM    | 256x256x128   | Y     | Y  | Y  | Y  |

    (*) Unit tests in this file
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Operand A: mxfp6 (e2m3 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp4 (e2m1 data with ue8m0 scale factors), column-major, 128-element aligned.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // Source (C) and output (D) tensors: fp16, row-major, 8-element aligned.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (1SM instruction) and cluster shape; with a 1SM instruction
  // the per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK   = Shape<_128,_128,_128>;
  using ClusterShape_MNK   = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an explicit 1SM mxf8f6f4 schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
|
||||
// NOTE(review): the suite name previously read "ue8m0xe3m1n", but ElementB below is
// cutlass::float_e2m1_t (there is no e3m1 floating-point type); the name is corrected
// to "e2m1n" so it matches the instantiated operand types and the sibling tests.
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // Operand A: mxfp6 (e2m3 data with ue8m0 scale factors), row-major, 128-element aligned.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // Operand B: mxfp4 (e2m1 data with ue8m0 scale factors), column-major, 128-element aligned.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // Source (C) and output (D) tensors: fp16, row-major, 8-element aligned.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::half_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue arithmetic are both performed in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-level tile (1SM instruction) and cluster shape; with a 1SM instruction
  // the per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK   = Shape<_128,_192,_128>;
  using ClusterShape_MNK   = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved out
  // of the mainloop's stage budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  // Mainloop with an explicit 1SM block-scaled schedule; the stage count fills
  // whatever SMEM remains after the epilogue's SharedStorage is subtracted.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel (runtime MxNxKxL problem shape) and its device adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the testbed's problem sweep and verify the results.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
// Block-scaled mixed-precision GEMM test (TN layout):
//   A: mxf6 = e3m2 data + ue8m0 block scale, row-major, 128-element aligned
//   B: mxf4 = e2m1 data + ue8m0 block scale, column-major, 128-element aligned
//   C/D: f16 row-major, f32 accumulation.
// 1SM MMA with a 128x256x128 tile and a 1x1x1 cluster; kernel and epilogue
// schedules are selected automatically by the collective builders.
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x256x128_1x1x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  // Narrow-precision (fp4/fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  // For N=256 using f32 and f16 consumes too much SMEM space for Epilogue.
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_1,_1,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (identical to the MMA tile here since this is a 1SM configuration)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                       // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled mixed-precision GEMM test (TN layout):
//   A: mxf6 = e3m2 data + ue8m0 block scale, row-major, 128-element aligned
//   B: mxf4 = e2m1 data + ue8m0 block scale, column-major, 128-element aligned
//   C/D: f16 row-major, f32 accumulation.
// 2SM MMA: the 256x128x128 MMA tile is cooperatively computed by a 2-CTA pair
// (128x128x128 output per SM), 2x4x1 cluster, with an explicitly chosen
// 2SM Mxf8f6f4 TMA warp-specialized kernel schedule.
// NOTE(review): the suite name says "e3m1" but ElementB's data type is e2m1
// (no e3m1 fp4 format exists) -- this looks like a typo in the test name;
// confirm before renaming, since CI filters may match on the current string.
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x128x128_2x4x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  // Narrow-precision (fp4/fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (half the MMA tile's M extent, since two SMs share one 2SM MMA)
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100             // Explicit 2SM kernel schedule for f8/f6/f4 operands
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled mixed-precision GEMM test (TN layout):
//   A: mxf6 = e2m3 data + ue8m0 block scale, row-major, 128-element aligned
//   B: mxf4 = e2m1 data + ue8m0 block scale, column-major, 128-element aligned
//   C/D: f16 row-major, f32 accumulation.
// 2SM MMA: 256x192x128 MMA tile shared by a 2-CTA pair (128x192x128 per SM),
// 2x2x1 cluster, with an explicitly chosen 2SM block-scaled kernel schedule.
// NOTE(review): the suite name says "e3m1" but ElementB's data type is e2m1
// (no e3m1 fp4 format exists) -- this looks like a typo in the test name;
// confirm before renaming, since CI filters may match on the current string.
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  // Narrow-precision (fp4/fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_192,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (half the MMA tile's M extent, since two SMs share one 2SM MMA)
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100          // Explicit 2SM block-scaled kernel schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled mixed-precision GEMM test (TN layout):
//   A: mxf6 = e3m2 data + ue8m0 block scale, row-major, 128-element aligned
//   B: mxf4 = e2m1 data + ue8m0 block scale, column-major, 128-element aligned
//   C/D: f16 row-major, f32 accumulation.
// 2SM MMA: 256x256x128 MMA tile shared by a 2-CTA pair (128x256x128 per SM),
// 4x2x1 cluster; kernel and epilogue schedules chosen by the builders.
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  // Narrow-precision (fp4/fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::half_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (half the MMA tile's M extent, since two SMs share one 2SM MMA)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                       // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,304 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp6xmxfp6 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
 * The supported Mma tile shapes depend on the layout for mxfp4 and mxfp6 mixed precision GEMM
|
||||
The tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|
||||
|--------|---------------|----|----|-------|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// Block-scaled mxfp6 x mxfp6 GEMM test (NT layout):
//   A: mxf6 = e2m3 data + ue8m0 block scale, column-major, 128-element aligned
//   B: mxf6 = e2m3 data + ue8m0 block scale, row-major, 128-element aligned
//   C: void (no source operand), D: bf16 row-major, f32 accumulation.
// 1SM MMA with a 128x128x128 tile and a 4x4x1 cluster, using an explicitly
// chosen 1SM Mxf8f6f4 TMA warp-specialized kernel schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  // Narrow-precision (fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  // void C elides the source operand; the epilogue computes D without reading C.
  using ElementC = void;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::bfloat16_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (identical to the MMA tile here since this is a 1SM configuration)
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100             // Explicit 1SM kernel schedule for f8/f6/f4 operands
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled mxfp6 x mxfp6 GEMM test (NT layout):
//   A: mxf6 = e3m2 data + ue8m0 block scale, column-major, 128-element aligned
//   B: mxf6 = e2m3 data + ue8m0 block scale, row-major, 128-element aligned
//   C: void (no source operand), D: bf16 row-major, f32 accumulation.
// 1SM MMA with a 128x256x128 tile and a 4x2x1 cluster; kernel and epilogue
// schedules are selected automatically by the collective builders.
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  // Narrow-precision (fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  // For N=256 using f32 and f16 consumes too much SMEM space for Epilogue.
  // void C elides the source operand; the epilogue computes D without reading C.
  using ElementC = void;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::bfloat16_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (identical to the MMA tile here since this is a 1SM configuration)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                       // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Block-scaled mxfp6 x mxfp6 GEMM test (NT layout):
//   A: mxf6 = e3m2 data + ue8m0 block scale, column-major, 128-element aligned
//   B: mxf6 = e2m3 data + ue8m0 block scale, row-major, 128-element aligned
//   C: void (no source operand), D: bf16 row-major, f32 accumulation.
// 2SM MMA: 256x256x128 MMA tile shared by a 2-CTA pair (128x256x128 per SM),
// 2x4x1 cluster, with an explicitly chosen 2SM block-scaled kernel schedule.
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  // Narrow-precision (fp6) operands require 128-element alignment on the contiguous dimension.
  constexpr int AlignA = 128;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  // void C elides the source operand; the epilogue computes D without reading C.
  using ElementC = void;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::bfloat16_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (half the MMA tile's M extent, since two SMs share one 2SM MMA)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                               // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                    // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                 // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                      // C tensor description
      ElementD, GmemLayoutD, AlignD,                                      // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                 // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,    // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                      // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                      // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                 // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                 // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100          // Explicit 2SM block-scaled kernel schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                             // Problem shape (M, N, K, L) where L is the batch dimension
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp6xmxfp6 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
 * The supported Mma tile shapes depend on the layout for mxfp6 mixed precision GEMM
|
||||
The tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|
||||
|--------|---------------|-------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // A operand: mxfp6 (e2m3 data + ue8m0 scale), row major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp6 (e2m3 data + ue8m0 scale), column major, 128-element alignment.
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C source is unused (void); D is written out as bf16. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-instruction tile, multicast cluster, and per-SM output tile shapes.
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop pins the explicit 1-SM mxf8f6f4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // A operand: mxfp6 (e2m3 data + ue8m0 scale), row major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp6 (e3m2 data + ue8m0 scale), column major, 128-element alignment.
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C source is unused (void); D is written out as bf16. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-instruction tile, multicast cluster, and per-SM output tile shapes.
  using MmaTileShape_MNK = Shape<_128,_192,_128>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop lets the builder pick the kernel schedule (KernelScheduleAuto).
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // A operand: mxfp6 (e3m2 data + ue8m0 scale), row major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp6 (e2m3 data + ue8m0 scale), column major, 128-element alignment.
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C source is unused (void); D is written out as bf16, both row major.
  // bf16 output is chosen because at N=256 an f32/f16 epilogue consumes too much SMEM.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-instruction tile, multicast cluster, and per-SM output tile shapes.
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop pins the explicit 1-SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 256x128x128_2x1x1_2sm_auto) {
  // A operand: mxfp6 (e3m2 data + ue8m0 scale), row major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp6 (e3m2 data + ue8m0 scale), column major, 128-element alignment.
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C source is unused (void); D is written out as bf16. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA: the 256x128 instruction tile is split across two SMs,
  // so the per-SM epilogue tile is 128x128.
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop pins the explicit 2-SM mxf8f6f4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // A operand: mxfp6 (e2m3 data + ue8m0 scale), row major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp6 (e3m2 data + ue8m0 scale), column major, 128-element alignment.
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C source is unused (void); D is written out as bf16. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA: the 256x192 instruction tile is split across two SMs,
  // so the per-SM epilogue tile is 128x192.
  using MmaTileShape_MNK = Shape<_256,_192,_128>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop lets the builder pick the kernel schedule (KernelScheduleAuto).
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // A operand: mxfp6 (e3m2 data + ue8m0 scale), row major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp6 (e2m3 data + ue8m0 scale), column major, 128-element alignment.
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C source is unused (void); D is written out as bf16. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA: the 256x256 instruction tile is split across two SMs,
  // so the per-SM epilogue tile is 128x256.
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop pins the explicit 2-SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,523 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
    \brief Unit tests for mxfp6xmxfp8 Block Scaled Gemm

    * A tensor:
      * Types: {e2m3,e3m2}xue8m0
      * Layout: Column Major (N)
      * Alignment: 128 elements
    * B tensor:
      * Types: {e5m2,e4m3}xue8m0
      * Layout: Row Major (T)
      * Alignment: 16 elements
    * Mma Tile Shapes supported:
      For the A tensor (mxfp6 type) the tile dimension with stride-1 should be divisible by 128,
      i.e., 128-element aligned.
      Support Matrix (Y: Yes, N: No)
      | 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
      |--------|---------------|----|----|--------|----|
      | 1SM    | 128x128x128   | Y  | Y  | Y      | Y  |
      | 1SM    | 128x192x128   | Y  | Y  | Y      | Y  |
      | 1SM    | 128x256x128   | Y  | Y  | Y      | Y  |
      | 2SM    | 256x128x128   | Y  | Y  | Y      | Y  |
      | 2SM    | 256x192x128   | Y  | Y  | Y      | Y  |
      | 2SM    | 256x256x128   | Y  | Y  | Y      | Y  |

      (*) Unit tests in this file
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // A operand: mxfp6 (e2m3 data + ue8m0 scale), column major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e5m2 data + ue8m0 scale), row major, 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;

  // C source is unused (void); D is written out as fp32. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = float;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 4;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-instruction tile, multicast cluster, and per-SM output tile shapes.
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop pins the explicit 1-SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // A operand: mxfp6 (e2m3 data + ue8m0 scale), column major, 128-element alignment.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 128;
  // B operand: mxfp8 (e4m3 data + ue8m0 scale), row major, 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 16;

  // C source is unused (void); D is written out as fp32. Both are row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = float;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 4;

  // Accumulation and epilogue compute both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA-instruction tile, multicast cluster, and per-SM output tile shapes.
  using MmaTileShape_MNK = Shape<_128,_192,_128>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the epilogue first so its SMEM footprint can be carved out of the mainloop budget below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop lets the builder pick the kernel schedule (KernelScheduleAuto).
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and run it through the common 3.x testbed.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Block-scaled GEMM: A = mxfp6 (e3m2 data + ue8m0 scales, column-major),
  // B = mxfp8 (e5m2 data + ue8m0 scales, row-major), no C source, D = f32.
  // Uses the explicit 1-SM MXF8F6F4 mainloop schedule with an auto epilogue schedule.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100            // Explicit 1-SM TMA warp-specialized MXF8F6F4 schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256x128x128_4x1x1_2sm_auto) {
  // Block-scaled GEMM: A = mxfp6 (e3m2 data + ue8m0 scales, column-major),
  // B = mxfp8 (e4m3 data + ue8m0 scales, row-major), no C source, D = f32.
  // 2-SM MMA tile (256 rows split across 2 CTAs); both schedules left to Auto.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_1,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA (half of the 2-SM MMA tile's M)
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                      // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // Block-scaled GEMM: A = mxfp6 (e2m3 data + ue8m0 scales, column-major),
  // B = mxfp8 (e4m3 data + ue8m0 scales, row-major), no C source, D = f32.
  // Uses the explicit 2-SM block-scaled mainloop schedule with an auto epilogue schedule.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_192,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA (half of the 2-SM MMA tile's M)
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100         // Explicit 2-SM TMA warp-specialized block-scaled schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Block-scaled GEMM: A = mxfp6 (e3m2 data + ue8m0 scales, column-major),
  // B = mxfp8 (e5m2 data + ue8m0 scales, row-major), no C source, D = f32.
  // Largest 2-SM tile in this file; explicit 2-SM MXF8F6F4 mainloop schedule.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA (half of the 2-SM MMA tile's M)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100            // Explicit 2-SM TMA warp-specialized MXF8F6F4 schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp6xmxfp8 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 16 elements
|
||||
* Mma Tile Shapes supported:
|
||||
For the A tensor (mxfp6 type) the tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|
||||
|--------|---------------|--------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x192x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // TN block-scaled GEMM: A = mxfp6 (e3m2 data + ue8m0 scales, row-major),
  // B = mxfp8 (e5m2 data + ue8m0 scales, column-major), no C source, D = f32.
  // Uses the explicit 1-SM MXF8F6F4 mainloop schedule with an auto epilogue schedule.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100            // Explicit 1-SM TMA warp-specialized MXF8F6F4 schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // TN block-scaled GEMM: A = mxfp6 (e3m2 data + ue8m0 scales, row-major),
  // B = mxfp8 (e4m3 data + ue8m0 scales, column-major), no C source, D = f32.
  // Uses the explicit 1-SM block-scaled mainloop schedule with an auto epilogue schedule.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_192,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100         // Explicit 1-SM TMA warp-specialized block-scaled schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // TN block-scaled GEMM: A = mxfp6 (e2m3 data + ue8m0 scales, row-major),
  // B = mxfp8 (e5m2 data + ue8m0 scales, column-major), no C source, D = f32.
  // Both kernel and epilogue schedules are left to Auto.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  // For N=256 using f32 and f16 consumes too much SMEM space for Epilogue.
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                      // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
  // TN block-scaled GEMM: A = mxfp6 (e2m3 data + ue8m0 scales, row-major),
  // B = mxfp8 (e4m3 data + ue8m0 scales, column-major), no C source, D = f32.
  // 2-SM MMA tile; explicit 2-SM MXF8F6F4 mainloop schedule with an auto epilogue schedule.

  // Describe A and B tensors
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignA = 128;   // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = void;        // void: the C operand is not read (no source accumulation)
  constexpr int AlignC = 4;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = float;
  constexpr int AlignD = 4;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA (half of the 2-SM MMA tile's M)
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto                // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100            // Explicit 2-SM TMA warp-specialized MXF8F6F4 schedule
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,                                            // Rank-4 runtime problem shape (M, N, K, L)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // Block-scaled GEMM: A is mxfp6 (e3m2 data + ue8m0 scales), B is mxfp8 (e4m3 data + ue8m0 scales).
  // A operand: row major; the mxfp6 type requires 128-element alignment on the stride-1 dimension.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;

  // B operand: column major, 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C is void (no accumulator source tensor); D is f32. Both row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = float;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 4;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile 256x192x128 on a 2x2x1 cluster; each SM produces a 128x192 output tile.
  using MmaTileShape_MNK = Shape<_256,_192,_128>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Block-scaled GEMM: A is mxfp6 (e2m3 data + ue8m0 scales), B is mxfp8 (e5m2 data + ue8m0 scales).
  // A operand: row major; the mxfp6 type requires 128-element alignment on the stride-1 dimension.
  using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 128;

  // B operand: column major, 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C is void (no accumulator source tensor); D is f32. Both row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 4;
  using ElementD = float;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 4;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile 256x256x128 on a 4x2x1 cluster; each SM produces a 128x256 output tile.
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // let the builder pick the kernel schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,304 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp8xmxfp4 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 16 elements
|
||||
* B tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported:
|
||||
For the B tensor (mxfp4 type) the tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|
||||
|--------|---------------|----|----|--------|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Block-scaled GEMM: A is mxfp8 (e4m3 data + ue8m0 scales), B is mxfp4 (e2m1 data + ue8m0 scales).
  // A operand: column major, 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 16;

  // B operand: row major; the mxfp4 type requires 128-element alignment on the stride-1 dimension.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // C is f16 source, D is bf16 output. Both row major, 8-element alignment.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM MMA tile 128x128x128 on a 4x4x1 cluster; per-SM output tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // let the builder pick the kernel schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Block-scaled GEMM: A is mxfp8 (e5m2 data + ue8m0 scales), B is mxfp4 (e2m1 data + ue8m0 scales).
  // A operand: column major, 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 16;

  // B operand: row major; the mxfp4 type requires 128-element alignment on the stride-1 dimension.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // C is f16 source, D is bf16 output. Both row major, 8-element alignment.
  // For N=256 using f32 and f16 consumes too much SMEM space for Epilogue.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM MMA tile 128x256x128 on a 4x2x1 cluster; per-SM output tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Block-scaled GEMM: A is mxfp8 (e5m2 data + ue8m0 scales), B is mxfp4 (e2m1 data + ue8m0 scales).
  // A operand: column major, 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  constexpr int AlignA = 16;

  // B operand: row major; the mxfp4 type requires 128-element alignment on the stride-1 dimension.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::RowMajor;
  constexpr int AlignB = 128;

  // C is f16 source, D is bf16 output. Both row major, 8-element alignment.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile 256x256x128 on a 4x2x1 cluster; each SM produces a 128x256 output tile.
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,523 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp8xmxfp4 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 16 elements
|
||||
* B tensor:
|
||||
* Types: {e2m1}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported:
|
||||
For the B tensor (mxfp4 type) the tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|
||||
|--------|---------------|--------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Block-scaled GEMM: A is mxfp8 (e4m3 data + ue8m0 scales), B is mxfp4 (e2m1 data + ue8m0 scales).
  // A operand: row major, 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 16;

  // B operand: column major; the mxfp4 type requires 128-element alignment on the stride-1 dimension.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 128;

  // C is f16 source, D is bf16 output. Both row major, 8-element alignment.
  using ElementC = cutlass::half_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM MMA tile 128x128x128 on a 4x4x1 cluster; per-SM output tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // Block-scaled GEMM: A is mxfp8 (e4m3 data + ue8m0 scales), B is mxfp4 (e2m1 data + ue8m0 scales).
  // A operand: row major, 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  // B operand: column major; the mxfp4 type requires 128-element alignment on the stride-1 dimension.
  using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 128;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // C is f16 source, D is bf16 output. Both row major, 8-element alignment.
  // Fully qualify half_t as cutlass::half_t (matches the sibling tests); the
  // unqualified name only resolved through `using namespace cute`.
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::bfloat16_t;
  constexpr int AlignD = 8;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // MMA accumulation and epilogue compute precision are both f32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM MMA tile 128x192x128 on a 2x2x1 cluster; per-SM output tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_192,_128>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the collective epilogue first: its shared-memory footprint is
  // carved out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder choose an epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Reserve the epilogue's SMEM when auto-sizing the pipeline stage count.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // let the builder pick the kernel schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared correctness harness.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x256x128_4x1x1_1sm_auto) {
  // Operand A: MXFP8 (e5m2 payload, ue8m0 block scales), row-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using LayoutA = cutlass::layout::RowMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP4 (e2m1 payload, ue8m0 block scales), column-major, 128-element aligned.
  using TypeB   = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutB = cutlass::layout::ColumnMajor;
  constexpr int kAlignB = 128;

  // C (source) and D (destination) tensor descriptions.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::bfloat16_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 8;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // MMA instruction tile, multicast cluster shape, and the per-SM output
  // tile handed to the collective epilogue.
  using TileShape    = Shape<_128,_256,_128>;
  using ClusterShape = Shape<_4,_1,_1>;
  using EpiTileShape = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // explicit 1-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
  // Operand A: MXFP8 (e5m2 payload, ue8m0 block scales), row-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using LayoutA = cutlass::layout::RowMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP4 (e2m1 payload, ue8m0 block scales), column-major, 128-element aligned.
  using TypeB   = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutB = cutlass::layout::ColumnMajor;
  constexpr int kAlignB = 128;

  // C (source) and D (destination) tensor descriptions.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::bfloat16_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 8;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // 256x128 MMA tile across a 4x4 cluster; each SM of the 2-SM pair
  // produces a 128x128 output tile for the epilogue.
  using TileShape    = Shape<_256,_128,_128>;
  using ClusterShape = Shape<_4,_4,_1>;
  using EpiTileShape = Shape<_128,_128,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the kernel schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x192x128_2x1x1_2sm_auto) {
  // Operand A: MXFP8 (e4m3 payload, ue8m0 block scales), row-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using LayoutA = cutlass::layout::RowMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP4 (e2m1 payload, ue8m0 block scales), column-major, 128-element aligned.
  using TypeB   = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutB = cutlass::layout::ColumnMajor;
  constexpr int kAlignB = 128;

  // C (source) and D (destination) tensor descriptions.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::bfloat16_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 8;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // 256x192 MMA tile on a 2x1 cluster; each SM of the 2-SM pair
  // produces a 128x192 output tile for the epilogue.
  using TileShape    = Shape<_256,_192,_128>;
  using ClusterShape = Shape<_2,_1,_1>;
  using EpiTileShape = Shape<_128,_192,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100  // explicit 2-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Operand A: MXFP8 (e5m2 payload, ue8m0 block scales), row-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using LayoutA = cutlass::layout::RowMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP4 (e2m1 payload, ue8m0 block scales), column-major, 128-element aligned.
  using TypeB   = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
  using LayoutB = cutlass::layout::ColumnMajor;
  constexpr int kAlignB = 128;

  // C (source) and D (destination) tensor descriptions.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::bfloat16_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 8;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // 256x256 MMA tile across a 4x2 cluster; each SM of the 2-SM pair
  // produces a 128x256 output tile for the epilogue.
  using TileShape    = Shape<_256,_256,_128>;
  using ClusterShape = Shape<_4,_2,_1>;
  using EpiTileShape = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,304 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
    \brief Unit tests for mxfp8 x mxfp6 block-scaled GEMM.

    * A tensor:
      * Types: {e5m2, e4m3} x ue8m0
      * Layout: Column Major (N)
      * Alignment: 16 elements
    * B tensor:
      * Types: {e2m3, e3m2} x ue8m0
      * Layout: Row Major (T)
      * Alignment: 128 elements
    * MMA tile shapes supported:
      For the B tensor (mxfp6 type), the tile dimension with stride 1 must be
      divisible by 128, i.e., 128-element aligned.

      Support Matrix (Y: Yes, N: No)
      | 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
      |--------|---------------|----|----|--------|----|
      | 1SM    | 128x128x128   | Y  | Y  | Y      | Y  |
      | 1SM    | 128x192x128   | Y  | N  | N      | Y  |
      | 1SM    | 128x256x128   | Y  | Y  | Y      | Y  |
      | 2SM    | 256x128x128   | Y  | N  | N      | Y  |
      | 2SM    | 256x192x128   | Y  | N  | N      | Y  |
      | 2SM    | 256x256x128   | Y  | Y  | Y      | Y  |

    (*) Unit tests in this file
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x128x128_1x4x1_1sm_auto) {
  // Operand A: MXFP8 (e4m3 payload, ue8m0 block scales), column-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using LayoutA = cutlass::layout::ColumnMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP6 (e2m3 payload, ue8m0 block scales), row-major, 128-element aligned.
  using TypeB   = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using LayoutB = cutlass::layout::RowMajor;
  constexpr int kAlignB = 128;

  // C (source, fp16) and D (destination, fp8 e4m3) tensor descriptions.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::float_e4m3_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 16;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // 128x128 MMA tile on a 1x4 cluster; 1-SM kernel, so the epilogue's
  // per-SM output tile matches the MMA tile.
  using TileShape    = Shape<_128,_128,_128>;
  using ClusterShape = Shape<_1,_4,_1>;
  using EpiTileShape = Shape<_128,_128,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the kernel schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Operand A: MXFP8 (e5m2 payload, ue8m0 block scales), column-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using LayoutA = cutlass::layout::ColumnMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP6 (e2m3 payload, ue8m0 block scales), row-major, 128-element aligned.
  using TypeB   = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using LayoutB = cutlass::layout::RowMajor;
  constexpr int kAlignB = 128;

  // C (source) and D (destination) tensor descriptions.
  // NOTE: with N = 256, using f32/f16 for both C and D consumes too much
  // SMEM in the epilogue, hence fp16 C and fp8 (e4m3) D.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::float_e4m3_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 16;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // 128x256 MMA tile on a 4x2 cluster; 1-SM kernel, so the epilogue's
  // per-SM output tile matches the MMA tile.
  using TileShape    = Shape<_128,_256,_128>;
  using ClusterShape = Shape<_4,_2,_1>;
  using EpiTileShape = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Operand A: MXFP8 (e5m2 payload, ue8m0 block scales), column-major, 16-element aligned.
  using TypeA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using LayoutA = cutlass::layout::ColumnMajor;
  constexpr int kAlignA = 16;
  // Operand B: MXFP6 (e2m3 payload, ue8m0 block scales), row-major, 128-element aligned.
  using TypeB   = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  using LayoutB = cutlass::layout::RowMajor;
  constexpr int kAlignB = 128;

  // C (source, fp16) and D (destination, fp8 e4m3) tensor descriptions.
  using TypeC   = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int kAlignC = 8;
  using TypeD   = cutlass::float_e4m3_t;
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int kAlignD = 16;

  // MMA accumulator type and epilogue compute precision.
  using TypeAccum   = float;
  using TypeCompute = float;

  // 256x256 MMA tile across a 4x2 cluster; each SM of the 2-SM pair
  // produces a 128x256 output tile for the epilogue.
  using TileShape    = Shape<_256,_256,_128>;
  using ClusterShape = Shape<_4,_2,_1>;
  using EpiTileShape = Shape<_128,_256,_128>;

  // Build the collective epilogue first: its SMEM footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      EpiTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the epilogue subtile
      TypeAccum, TypeCompute,
      TypeC, LayoutC, kAlignC,
      TypeD, LayoutD, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      TypeA, LayoutA, kAlignA,
      TypeB, LayoutB, kAlignB,
      TypeAccum,
      TileShape, ClusterShape,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM and run the shared verification harness.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<Shape<int,int,int,int>, Mainloop, Epilogue>;
  using Gemm   = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp8xmxfp6 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 16 elements
|
||||
* B tensor:
|
||||
* Types: {e2m3,e3m2}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* Mma Tile Shapes supported:
|
||||
For the B tensor (mxfp6 type) the tile dimension with stride-1 should be divisible by 128, i.e., 128 element aligned.
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|
||||
|--------|---------------|--------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | N | N | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | N | N | Y |
|
||||
| 2SM | 256x192x128 | Y | N | N | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;    // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e4m3_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100        // Kernel schedule policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignB = 128;    // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e4m3_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_192,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100           // Kernel schedule policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;    // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  // For N=256 using f32 and f16 consumes too much SMEM space for Epilogue.
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e4m3_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignB = 128;    // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e4m3_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (2SM Mma tile is split across two CTAs along M: 256 -> 128 per SM)
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100        // Kernel schedule policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
  constexpr int AlignB = 128;    // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e4m3_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_192,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (2SM Mma tile is split across two CTAs along M: 256 -> 128 per SM)
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100           // Kernel schedule policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
  constexpr int AlignB = 128;    // mxfp6 operand: stride-1 dimension must be 128-element aligned
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e4m3_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  // (2SM Mma tile is split across two CTAs along M: 256 -> 128 per SM)
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,523 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp8xmxfp8 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 128 elements
|
||||
* B tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 16 elements
|
||||
* Mma Tile Shapes supported:
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|
||||
|--------|---------------|----|----|--------|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x192x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x128x128_1x2x1_1sm_auto) {
  // Describe A and B tensors
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignA = 16;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  constexpr int AlignB = 16;
  using GmemLayoutB = cutlass::layout::RowMajor;

  // Describe C and D tensors
  // ElementC = void: the epilogue has no C operand (source-less, D = alpha * acc)
  using ElementC = void;
  constexpr int AlignC = 16;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e5m2_t;
  constexpr int AlignD = 16;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Tile and cluster shapes
  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_128>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                             // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                  // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                               // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                    // C tensor description
      ElementD, GmemLayoutD, AlignD,                                    // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto               // Epilogue schedule policy
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,  // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                    // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                    // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                               // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                               // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x192x128_1x4x1_1sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // A: e5m2 data, ue8m0 scales
  using LayoutA = cutlass::layout::ColumnMajor;                 // A is N-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // B: e4m3 data, ue8m0 scales
  using LayoutB = cutlass::layout::RowMajor;                    // B is T-layout
  constexpr int AlignmentB = 16;

  using ElemC   = void;                                         // no C tensor is read
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: MMA-instruction tile, multicast cluster, and the output tile one SM produces.
  using MmaTile = Shape<_128,_192,_128>;
  using Cluster = Shape<_1,_4,_1>;
  using CtaTile = Shape<_128,_192,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // explicit 1-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // A: e4m3 data, ue8m0 scales
  using LayoutA = cutlass::layout::ColumnMajor;                 // A is N-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // B: e5m2 data, ue8m0 scales
  using LayoutB = cutlass::layout::RowMajor;                    // B is T-layout
  constexpr int AlignmentB = 16;

  // With N = 256, an f32/f16 C tensor would consume too much epilogue SMEM, so C is void.
  using ElemC   = void;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: MMA-instruction tile, multicast cluster, and the output tile one SM produces.
  using MmaTile = Shape<_128,_256,_128>;
  using Cluster = Shape<_4,_2,_1>;
  using CtaTile = Shape<_128,_256,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // A: e4m3 data, ue8m0 scales
  using LayoutA = cutlass::layout::ColumnMajor;                 // A is N-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // B: e4m3 data, ue8m0 scales
  using LayoutB = cutlass::layout::RowMajor;                    // B is T-layout
  constexpr int AlignmentB = 16;

  using ElemC   = void;                                         // no C tensor is read
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: the MMA tile spans M=256; each SM of the pair owns a 128-row output
  // tile (per the test's 2sm designation), hence CtaTile M is half of MmaTile M.
  using MmaTile = Shape<_256,_128,_128>;
  using Cluster = Shape<_4,_4,_1>;
  using CtaTile = Shape<_128,_128,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // let the builder select the schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // A: e5m2 data, ue8m0 scales
  using LayoutA = cutlass::layout::ColumnMajor;                 // A is N-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // B: e4m3 data, ue8m0 scales
  using LayoutB = cutlass::layout::RowMajor;                    // B is T-layout
  constexpr int AlignmentB = 16;

  using ElemC   = void;                                         // no C tensor is read
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: the MMA tile spans M=256; each SM of the pair owns a 128-row output
  // tile (per the test's 2sm designation), hence CtaTile M is half of MmaTile M.
  using MmaTile = Shape<_256,_192,_128>;
  using Cluster = Shape<_2,_2,_1>;
  using CtaTile = Shape<_128,_192,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // A: e4m3 data, ue8m0 scales
  using LayoutA = cutlass::layout::ColumnMajor;                 // A is N-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // B: e5m2 data, ue8m0 scales
  using LayoutB = cutlass::layout::RowMajor;                    // B is T-layout
  constexpr int AlignmentB = 16;

  using ElemC   = void;                                         // no C tensor is read
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: the MMA tile spans M=256; each SM of the pair owns a 128-row output
  // tile (per the test's 2sm designation), hence CtaTile M is half of MmaTile M.
  using MmaTile = Shape<_256,_256,_128>;
  using Cluster = Shape<_4,_2,_1>;
  using CtaTile = Shape<_128,_256,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100  // explicit 2-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,524 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit tests for mxfp8xmxfp8 Block Scaled Gemm
|
||||
|
||||
* A tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Row Major (T)
|
||||
* Alignment: 16 elements
|
||||
* B tensor:
|
||||
* Types: {e5m2,e4m3}xue8m0
|
||||
* Layout: Column Major (N)
|
||||
* Alignment: 16 elements
|
||||
* Mma Tile Shapes supported:
|
||||
    All MMA tile shapes below are supported for mxfp8 operands. (NOTE(review): the original text referenced a 128-element stride-1 divisibility restriction for an "mxfp6 type" A tensor, which does not match this file's mxfp8-only operands — that restriction applies to mxfp6-typed tensors; confirm against the mxfp6 test variant.)
|
||||
Support Matrix (Y: Yes, N: No)
|
||||
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|
||||
|--------|---------------|--------|----|----|----|
|
||||
| 1SM | 128x128x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x192x128 | Y | Y | Y | Y |
|
||||
| 1SM | 128x256x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x128x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x192x128 | Y | Y | Y | Y |
|
||||
| 2SM | 256x256x128 | Y | Y | Y | Y |
|
||||
|
||||
(*) Unit tests in this file
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // A: e5m2 data, ue8m0 scales
  using LayoutA = cutlass::layout::RowMajor;                    // A is T-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // B: e5m2 data, ue8m0 scales
  using LayoutB = cutlass::layout::ColumnMajor;                 // B is N-layout
  constexpr int AlignmentB = 16;

  using ElemC   = void;                                         // no C tensor is read
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: MMA-instruction tile, multicast cluster, and the output tile one SM produces.
  using MmaTile = Shape<_128,_128,_128>;
  using Cluster = Shape<_4,_4,_1>;
  using CtaTile = Shape<_128,_128,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100  // explicit 1-SM mxf8f6f4 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // A: e5m2 data, ue8m0 scales
  using LayoutA = cutlass::layout::RowMajor;                    // A is T-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // B: e4m3 data, ue8m0 scales
  using LayoutB = cutlass::layout::ColumnMajor;                 // B is N-layout
  constexpr int AlignmentB = 16;

  using ElemC   = void;                                         // no C tensor is read
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: MMA-instruction tile, multicast cluster, and the output tile one SM produces.
  using MmaTile = Shape<_128,_192,_128>;
  using Cluster = Shape<_2,_2,_1>;
  using CtaTile = Shape<_128,_192,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // explicit 1-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
  //
  // Operands: MXFP8 inputs (data element + ue8m0 block scale factors), no C source, FP8 output.
  //
  using ElemA   = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // A: e4m3 data, ue8m0 scales
  using LayoutA = cutlass::layout::RowMajor;                    // A is T-layout
  constexpr int AlignmentA = 16;                                // 16-element alignment

  using ElemB   = cutlass::mx_float8_t<cutlass::float_e5m2_t>;  // B: e5m2 data, ue8m0 scales
  using LayoutB = cutlass::layout::ColumnMajor;                 // B is N-layout
  constexpr int AlignmentB = 16;

  // With N = 256, an f32/f16 C tensor would consume too much epilogue SMEM, so C is void.
  using ElemC   = void;
  using LayoutC = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 16;

  using ElemD   = cutlass::float_e5m2_t;                        // FP8 output tensor
  using LayoutD = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 16;

  using ElemAccum   = float;  // MMA instruction accumulator precision
  using ElemCompute = float;  // epilogue compute precision

  // Shapes: MMA-instruction tile, multicast cluster, and the output tile one SM produces.
  using MmaTile = Shape<_128,_256,_128>;
  using Cluster = Shape<_4,_2,_1>;
  using CtaTile = Shape<_128,_256,_128>;

  // Build the epilogue first: its shared-memory footprint is carved out of the
  // mainloop's stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      CtaTile, Cluster,
      cutlass::epilogue::collective::EpilogueTileAuto,   // let the builder pick the subtile
      ElemAccum, ElemCompute,
      ElemC, LayoutC, AlignmentC,
      ElemD, LayoutD, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElemA, LayoutA, AlignmentA,
      ElemB, LayoutB, AlignmentB,
      ElemAccum,
      MmaTile, Cluster,
      // Stage count chosen automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // let the builder select the schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using Kernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // dynamic (M,N,K,L) problem shape
      Mainloop,
      Epilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;

  // Run the testbed's problem-size sweep and verify results.
  bool passed = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
  // A operand: block-scaled FP8 (mx_float8_t over e4m3), row major ("t"), 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 16;

  // B operand: block-scaled FP8 (mx_float8_t over e4m3), column major ("n"), 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C is void (no source tensor read in the epilogue); D is written as e5m2, row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 16;
  using ElementD = cutlass::float_e5m2_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 16;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile (256 rows across the pair); the per-SM epilogue tile covers a 128-row slice.
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100  // explicit 2-SM MX-FP8 schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
  // A operand: block-scaled FP8 (mx_float8_t over e5m2), row major ("t"), 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 16;

  // B operand: block-scaled FP8 (mx_float8_t over e4m3), column major ("n"), 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C is void (no source tensor read in the epilogue); D is written as e5m2, row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 16;
  using ElementD = cutlass::float_e5m2_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 16;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile (256 rows across the pair); the per-SM epilogue tile covers a 128-row slice.
  using MmaTileShape_MNK = Shape<_256,_192,_128>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100  // explicit 2-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
  // A operand: block-scaled FP8 (mx_float8_t over e4m3), row major ("t"), 16-element alignment.
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 16;

  // B operand: block-scaled FP8 (mx_float8_t over e5m2), column major ("n"), 16-element alignment.
  using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 16;

  // C is void (no source tensor read in the epilogue); D is written as e5m2, row major.
  using ElementC = void;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 16;
  using ElementD = cutlass::float_e5m2_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 16;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile (256 rows across the pair); the per-SM epilogue tile covers a 128-row slice.
  using MmaTileShape_MNK = Shape<_256,_256,_128>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_128>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,683 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
    \brief Unit tests for nvfp4 x nvfp4 Block Scaled Gemm

    A tensor:
      Types: {e2m1} x ue4m3
      Layout: Row Major (T)
      Alignment: 32 elements
    B tensor:
      Types: {e2m1} x ue4m3
      Layout: Column Major (N)
      Alignment: 32 elements
    Mma Tile Shapes supported:
      Support Matrix (Y: Yes, N: No)
      | 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
      |--------|---------------|--------|----|----|----|
      | 1SM    | 128x128x256   |   Y    | N  | N  | N  |
      | 1SM    | 128x192x256   |   Y    | N  | N  | N  |
      | 1SM    | 128x256x256   |   Y    | N  | N  | N  |
      | 2SM    | 256x128x256   |   Y    | N  | N  | N  |
      | 2SM    | 256x192x256   |   Y    | N  | N  | N  |
      | 2SM    | 256x256x256   |   Y    | N  | N  | N  |

    (*) Unit tests in this file
*/
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_4x4x1_1sm_auto) {
  // A operand: NVFP4 (nv_float4_t over e2m1), row major ("t"), 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;

  // B operand: NVFP4 (nv_float4_t over e2m1), column major ("n"), 32-element alignment.
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // C and D tensors: bf16, row major, 8-element alignment.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM configuration: the per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_128,_256>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x2x1_2sm_auto) {
  // A operand: NVFP4 (nv_float4_t over e2m1), row major ("t"), 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;

  // B operand: NVFP4 (nv_float4_t over e2m1), column major ("n"), 32-element alignment.
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // C and D tensors: bf16, row major, 8-element alignment.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile (256 rows across the pair); the per-SM epilogue tile covers a 128-row slice.
  using MmaTileShape_MNK = Shape<_256,_128,_256>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
//
// Using targeted scheduling with **static** cluster shapes
//
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_2x1x1_1sm_auto) {
  // A operand: NVFP4 (nv_float4_t over e2m1), row major ("t"), 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;

  // B operand: NVFP4 (nv_float4_t over e2m1), column major ("n"), 32-element alignment.
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // C and D tensors: bf16, row major, 8-element alignment.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM configuration with a static 2x1x1 cluster; per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_128,_256>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto) {
  // A operand: NVFP4 (nv_float4_t over e2m1), row major ("t"), 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;

  // B operand: NVFP4 (nv_float4_t over e2m1), column major ("n"), 32-element alignment.
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // C and D tensors: bf16, row major, 8-element alignment.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 2-SM MMA tile (256 rows across the pair); the per-SM epilogue tile covers a 128-row slice.
  using MmaTileShape_MNK = Shape<_256,_128,_256>;
  using ClusterShape_MNK = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto  // builder selects the mainloop schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
//
// Using large Cta Tiles: N=192 and N=256
//
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x192x256_2x1x1_1sm_auto) {
  // A operand: NVFP4 (nv_float4_t over e2m1), row major ("t"), 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;

  // B operand: NVFP4 (nv_float4_t over e2m1), column major ("n"), 32-element alignment.
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // C and D tensors: bf16, row major, 8-element alignment.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulation and epilogue computation both run in fp32.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // 1-SM configuration with a large N=192 tile; per-SM epilogue tile equals the MMA tile.
  using MmaTileShape_MNK = Shape<_128,_192,_256>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_256>;

  // Build the epilogue first so its shared-memory footprint can be carved
  // out of the mainloop's stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,     // builder selects the epilogue subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto  // builder selects the epilogue schedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      // Stage count derived automatically after subtracting the epilogue's SMEM usage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100  // explicit 1-SM block-scaled schedule
    >::CollectiveOp;

  // Assemble the device-level GEMM from the two collectives.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,  // runtime problem shape (4 dynamic modes)
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the shared testbed and check all cases pass.
  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x256x256_2x1x1_1sm_auto) {
  // NVFP4 block-scaled GEMM on SM100 tensor cores: A (row-major) and
  // B (column-major) are e2m1 data with ue4m3 scale factors; C/D are bf16
  // row-major; accumulation and epilogue math are fp32.

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and the output tile owned by one SM.
  // For this 1SM kernel the per-SM tile matches the MMA tile.
  using MmaTileShape_MNK   = Shape<_128,_256,_256>;
  using ClusterShape_MNK   = Shape<_2,_1,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_256>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // explicit 1SM nvfp4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and its device-level adapter, then run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x192x256_2x4x1_2sm_auto) {
  // NVFP4 block-scaled GEMM on SM100 with a 2SM (cluster-cooperative) MMA:
  // a 256x192x256 MMA tile split across two SMs (128x192x256 output per SM).
  // A/B are e2m1 data with ue4m3 scale factors; C/D are bf16; math is fp32.

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and the per-SM output tile (M halved for 2SM).
  using MmaTileShape_MNK   = Shape<_256,_192,_256>;
  using ClusterShape_MNK   = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_192,_256>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // explicit 2SM nvfp4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100
    >::CollectiveOp;

  // Assemble the kernel and its device-level adapter, then run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x256x256_2x4x1_2sm_auto) {
  // NVFP4 block-scaled GEMM on SM100 with a 2SM MMA: a 256x256x256 MMA tile
  // split across two SMs (128x256x256 output per SM). A/B are e2m1 data with
  // ue4m3 scale factors; C/D are bf16 row-major; math is fp32.

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and the per-SM output tile (M halved for 2SM).
  using MmaTileShape_MNK   = Shape<_256,_256,_256>;
  using ClusterShape_MNK   = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_256,_256>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // explicit 2SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and its device-level adapter, then run the test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,374 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Runtime data type for blockscaled gemm fp4
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Using Runtime Types
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_4x2x1_1sm_auto_runtime_dtypes) {
  // Runtime-data-type variant: A/B use the type-erased dynamic nvfp4 wrapper,
  // and the concrete e2m1 format is supplied at run time through the testbed
  // (see the TestRuntimeDataTypeSmall call at the bottom). 1SM kernel,
  // 128x128x256 MMA tile, 4x2x1 cluster.

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::type_erased_dynamic_nv_float4_t;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::type_erased_dynamic_nv_float4_t;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and per-SM output tile (1SM: same as MMA tile).
  using MmaTileShape_MNK   = Shape<_128,_128,_256>;
  using ClusterShape_MNK   = Shape<_4,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // explicit 1SM nvfp4 TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
    >::CollectiveOp;

  // Assemble the kernel and its device-level adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run with the runtime data-type tags selecting e2m1 for both A and B.
  EXPECT_TRUE(test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF4Format::E2M1, cute::UMMA::MXF4Format::E2M1));
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto_runtime_dtypes) {
  // Runtime-data-type variant with a 2SM MMA: A/B use the type-erased dynamic
  // nvfp4 wrapper with the concrete e2m1 format supplied at run time. The
  // 256x128x256 MMA tile is split across two SMs (128x128x256 per SM).

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::type_erased_dynamic_nv_float4_t;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::type_erased_dynamic_nv_float4_t;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and the per-SM output tile (M halved for 2SM).
  using MmaTileShape_MNK   = Shape<_256,_128,_256>;
  using ClusterShape_MNK   = Shape<_2,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // explicit 2SM block-scaled TMA warp-specialized schedule.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
    >::CollectiveOp;

  // Assemble the kernel and its device-level adapter.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run with the runtime data-type tags selecting e2m1 for both A and B.
  EXPECT_TRUE(test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF4Format::E2M1, cute::UMMA::MXF4Format::E2M1));
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Using Stream-K Scheduler
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_1x4x1_1sm_auto_streamK) {
  // Stream-K variant of the NVFP4 block-scaled GEMM: identical operand setup
  // to the data-parallel tests, but the kernel is instantiated with the
  // Stream-K tile scheduler and an auto-selected mainloop schedule.

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and per-SM output tile (1SM: same as MMA tile).
  using MmaTileShape_MNK   = Shape<_128,_128,_256>;
  using ClusterShape_MNK   = Shape<_1,_4,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Stream-K work decomposition instead of the default data-parallel scheduler.
  using TileScheduler = cutlass::gemm::StreamKScheduler;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // kernel schedule chosen automatically by the builder.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel (with the Stream-K scheduler) and its device adapter,
  // then run the full test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      TileScheduler
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x2x1_2sm_auto_streamK) {
  // Stream-K variant with a 2SM MMA: the 256x128x256 MMA tile is split across
  // two SMs (128x128x256 per SM), work is distributed by the Stream-K tile
  // scheduler, and the mainloop schedule is auto-selected by the builder.

  // Operand A/B descriptions. 4-bit data requires 32-element alignment.
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutA = cutlass::layout::RowMajor;
  constexpr int AlignA = 32;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  constexpr int AlignB = 32;

  // Source (C) and destination (D) tensor descriptions.
  using ElementC = cutlass::bfloat16_t;
  using GmemLayoutC = cutlass::layout::RowMajor;
  constexpr int AlignC = 8;
  using ElementD = cutlass::bfloat16_t;
  using GmemLayoutD = cutlass::layout::RowMajor;
  constexpr int AlignD = 8;

  // Accumulator and epilogue compute precision.
  using ElementAccumulator = float;
  using ElementCompute = float;

  // MMA tile, multicast cluster, and the per-SM output tile (M halved for 2SM).
  using MmaTileShape_MNK   = Shape<_256,_128,_256>;
  using ClusterShape_MNK   = Shape<_2,_2,_1>;
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Stream-K work decomposition instead of the default data-parallel scheduler.
  using TileScheduler = cutlass::gemm::StreamKScheduler;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop's automatic stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,    // builder selects the subtile
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto // builder selects the schedule
    >::CollectiveOp;

  // Mainloop: stage count auto-derived after reserving the epilogue's SMEM;
  // kernel schedule chosen automatically by the builder.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel (with the Stream-K scheduler) and its device adapter,
  // then run the full test sweep.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      TileScheduler
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,436 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Unit test for nvfp4 Block Scaled Gemm with nvfp4 output
|
||||
D tensor:
|
||||
* Types: e2m1x{ue4m3}
|
||||
* Layout: Column Major (T)
|
||||
* Alignment: 32
|
||||
* Scale factors need to be generated with the fp4 output. It is generated along the continuous dimensions of the D tensor.
|
||||
* Meanwhile, before scale factor generation, it could have other epilogue fusion operation.
|
||||
* alpha
|
||||
* beta
|
||||
* activation
|
||||
* bias
|
||||
This UT tests
|
||||
- alpha + beta + scale-factor generation
|
||||
- alpha + beta + bias + scale-factor generation
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "../gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 16)
|
||||
// with alpha/beta fusion
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// nvfp4 GEMM with fp4 (e2m1) output plus generated ue4m3 scale factors,
// 1-SM block-scaled kernel, 128x128x256 MMA tile, 4x4x1 cluster.
// NOTE(review): the suite name says "ue8m0" but the operands below use
// nv_float4_t (e2m1 data with ue4m3 scale factors) — confirm intended naming.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs16_bstensorop_1sm_f32, 128x128x256_4x4x1) {
  // Describe A and B tensors
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 32;
  using GmemLayoutA = cutlass::layout::RowMajor;

  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 32;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e2m1_t;
  constexpr int AlignD = 32;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Describe SFD tensor (scale factors generated along the contiguous dim of D)
  using ElementSFD = cutlass::float_ue4m3_t;
  using GmemLayoutSFD = GmemLayoutD;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  //
  // Construct FusionOperation
  //
  constexpr int SFDVectorSize = 16;
  // Linear combination (alpha/beta) followed by block-scale-factor generation for D
  using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
      SFDVectorSize,
      ElementD, ElementCompute,
      ElementSFD, GmemLayoutSFD,
      ElementC
    >;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto,               // Epilogue schedule policy
      FusionOperation                                                    // BUGFIX: FusionOperation was defined but never passed,
                                                                         // so the epilogue defaulted to a plain linear combination
                                                                         // and the SFD tensor was never produced.
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// nvfp4 GEMM with fp4 (e2m1) output plus generated ue4m3 scale factors,
// 2-SM nvf4 kernel, 256x128x256 MMA tile, 4x4x1 cluster.
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstensorop_2sm_f32, 256x128x256_4x4x1) {
  // Describe A and B tensors
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 32;
  using GmemLayoutA = cutlass::layout::RowMajor;

  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 32;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e2m1_t;
  constexpr int AlignD = 32;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Describe SFD tensor (scale factors generated along the contiguous dim of D)
  using ElementSFD = cutlass::float_ue4m3_t;
  using GmemLayoutSFD = GmemLayoutD;

  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_256,_128,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  //
  // Construct FusionOperation
  //
  constexpr int SFDVectorSize = 16;
  // Linear combination (alpha/beta) followed by block-scale-factor generation for D
  using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
      SFDVectorSize,
      ElementD, ElementCompute,
      ElementSFD, GmemLayoutSFD,
      ElementC
    >;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto,               // Epilogue schedule policy
      FusionOperation                                                    // BUGFIX: FusionOperation was defined but never passed,
                                                                         // so no SFD scale factors were generated.
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 32)
|
||||
// with alpha/beta fusion
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// nvfp4 GEMM with fp4 (e2m1) output plus generated ue4m3 scale factors
// (output SF vector size 32), auto-scheduled kernel, 4x4x1 cluster.
// NOTE(review): the suite name says "ue8m0" but the operands below use
// nv_float4_t (e2m1 data with ue4m3 scale factors) — confirm intended naming.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs32_bstensorop_1sm_f32, 128x128x256_4x4x1) {
  // Describe A and B tensors
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 32;
  using GmemLayoutA = cutlass::layout::RowMajor;

  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 32;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e2m1_t;
  constexpr int AlignD = 32;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Describe SFD tensor (scale factors generated along the contiguous dim of D)
  using ElementSFD = cutlass::float_ue4m3_t;
  using GmemLayoutSFD = GmemLayoutD;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;

  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  //
  // Construct FusionOperation
  //
  constexpr int SFDVectorSize = 32;
  // Linear combination (alpha/beta) followed by block-scale-factor generation for D
  using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
      SFDVectorSize,
      ElementD, ElementCompute,
      ElementSFD, GmemLayoutSFD,
      ElementC
    >;

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      PerSmTileShape_MNK, ClusterShape_MNK,                              // Epilogue tile shape, and cluster shape
      cutlass::epilogue::collective::EpilogueTileAuto,                   // Epilogue subtile shape. Auto will find a suitable tile shape
      ElementAccumulator, ElementCompute,                                // Mma instr's accumulator type and compute precision for epilogue
      ElementC, GmemLayoutC, AlignC,                                     // C tensor description
      ElementD, GmemLayoutD, AlignD,                                     // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto,               // Epilogue schedule policy
      FusionOperation                                                    // BUGFIX: FusionOperation was defined but never passed,
                                                                         // so no SFD scale factors were generated.
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,   // Arch and Tensorop spec
      ElementA, GmemLayoutA, AlignA,                                     // A tensor elem type, layout and alignment requirement
      ElementB, GmemLayoutB, AlignB,                                     // B tensor elem type, layout and alignment requirement
      ElementAccumulator,                                                // Mma instruction accumulator type
      MmaTileShape_MNK, ClusterShape_MNK,                                // Mma instruction tile shape, cluster shape
      // Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto                      // Kernel schedule policy. Auto or using targeted scheduling policy
    >::CollectiveOp;

  // Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Run tests
  auto pass = test::gemm::device::TestAll<Gemm>();
  // Check results
  EXPECT_TRUE(pass);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 16)
|
||||
// with alpha+beta+relu+bias fusion
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// nvfp4 GEMM with fp4 (e2m1) output, generated ue4m3 scale factors, and
// per-column bias fusion; 1-SM nvf4 kernel, 128x128x256 tile, 4x4x1 cluster.
// NOTE(review): the suite name mentions "relu" but LinCombPerColBiasBlockScaleFactor
// below carries no activation — confirm whether an activation variant was intended.
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1n_outputVs16_bstensorop_1sm_f32_bias_relu, 128x128x256_4x4x1) {
  // Describe A and B tensors
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignA = 32;
  using GmemLayoutA = cutlass::layout::RowMajor;

  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  constexpr int AlignB = 32;
  using GmemLayoutB = cutlass::layout::ColumnMajor;

  // Describe C and D tensors
  using ElementC = cutlass::half_t;
  constexpr int AlignC = 8;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ElementD = cutlass::float_e2m1_t;
  constexpr int AlignD = 32;
  using GmemLayoutD = cutlass::layout::RowMajor;

  // Describe SFD tensor (scale factors generated along the contiguous dim of D)
  using ElementSFD = cutlass::float_ue4m3_t;
  using GmemLayoutSFD = GmemLayoutD;

  // Mma's accumulator type
  using ElementAccumulator = float;
  // Epilogue computation's precision type
  using ElementCompute = float;
  // Bias type
  using ElementBias = float;

  // Collective MMA takes tile shape of the MMA operation as input
  using MmaTileShape_MNK = Shape<_128,_128,_256>;
  // Cluster size for multicast
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  // Collective Epilogue takes the output tile shape for 1 CTA
  using PerSmTileShape_MNK = Shape<_128,_128,_256>;

  // BUGFIX: was 32, but the suite name ("outputVs16") and the section comment
  // both specify an output scale-factor vector size of 16.
  constexpr int SFDVectorSize = 16;

  // alpha/beta + per-column bias, followed by block-scale-factor generation for D
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasBlockScaleFactor<
      SFDVectorSize, ElementD, ElementCompute,
      ElementSFD, GmemLayoutSFD,
      ElementBias, ElementC
    >;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      PerSmTileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,   // BUGFIX: D was passed with GmemLayoutC
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, AlignA,
      ElementB, GmemLayoutB, AlignB,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestAll<Gemm>();
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,364 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide Ptr-Array GEMM interface
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
|
||||
// Ptr-array bf16 GEMM, 2-SM MMA (AtomThrShape divides the 4x1x1 cluster by 2),
// 256x128x64 tile.
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_2sm_f32_ptr_array, 256x128x64_4x1x1) {
  // A matrix configuration
  using ElementA = cutlass::bfloat16_t;                                       // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;                                  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;     // 16-byte access granularity, in elements
  // B matrix configuration
  using ElementB = cutlass::bfloat16_t;                                       // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;                               // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;     // 16-byte access granularity, in elements
  // C matrix configuration
  using ElementC = cutlass::bfloat16_t;                                       // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;                               // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;     // 16-byte access granularity, in elements
  // D matrix configuration
  using ElementD = cutlass::bfloat16_t;                                       // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;                               // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;     // 16-byte access granularity, in elements
  // Core kernel configurations
  using ElementAccumulator = float;                                           // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                                       // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;                       // Operator class tag
  using TileShape_MNK = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_4,_1,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  // BUGFIX: the test is named "2sm" and AtomThrShape is built by dividing the
  // cluster by <2,1,1> (a 2-SM MMA atom), but the 1Sm kernel/epilogue schedules
  // were selected. Use the 2Sm ptr-array schedules to match (the sibling "1sm"
  // tests in this file divide by <1,1,1> and use the 1Sm schedules).
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(result);
}
|
||||
|
||||
// Ptr-array bf16 GEMM on SM100: 1-SM MMA atom, 128x128x64 CTA tile, 1x2x1 cluster.
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
  // Kernel-wide configuration tags.
  using ArchTag            = cutlass::arch::Sm100;          // minimum SM for this feature set
  using OperatorClass      = cutlass::arch::OpClassTensorOp;
  using ElementAccumulator = float;                         // internal accumulation type

  // Operand descriptions: element type, layout, and alignment in elements
  // (16-byte access granularity for each operand).
  using ElementA = cutlass::bfloat16_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::bfloat16_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::bfloat16_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::bfloat16_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Tile / cluster geometry. The MMA atom spans one SM, so AtomThrShape is the
  // cluster divided by <1,1,1>.
  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape     = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape     = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // Ptr-array 1-SM schedules for mainloop and epilogue.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  // Carve the epilogue's SMEM footprint out of the mainloop stage budget.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  const bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Ptr-array bf16 GEMM on SM100: 1-SM MMA atom, 128x64x64 CTA tile, 1x2x1 cluster.
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // Kernel-wide configuration tags.
  using ArchTag            = cutlass::arch::Sm100;          // minimum SM for this feature set
  using OperatorClass      = cutlass::arch::OpClassTensorOp;
  using ElementAccumulator = float;                         // internal accumulation type

  // Operand descriptions: element type, layout, and alignment in elements
  // (16-byte access granularity for each operand).
  using ElementA = cutlass::bfloat16_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::bfloat16_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::bfloat16_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::bfloat16_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Tile / cluster geometry. The MMA atom spans one SM, so AtomThrShape is the
  // cluster divided by <1,1,1>.
  using TileShape_MNK    = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape     = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape     = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // Ptr-array 1-SM schedules for mainloop and epilogue.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  // Carve the epilogue's SMEM footprint out of the mainloop stage budget.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  const bool passed = test::gemm::device::TestSmall<Gemm>(3.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // Operands: A is row-major bf16, B is column-major bf16, C/D are
  // column-major bf16. Alignments correspond to 16-byte accesses.
  using ElementA = cutlass::bfloat16_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::bfloat16_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::bfloat16_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::bfloat16_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: fp32 accumulation, 256x128x64 tile on a 2x1x1
  // cluster, 2-SM ptr-array TMA warp-specialized mainloop and epilogue.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;
  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  // Stage count is derived automatically after carving out the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha = 1.0, beta = 0.0
  EXPECT_TRUE(TestSmall<Gemm>(1.0, 0.0));
}
|
||||
|
||||
TEST(SM100_Device_Gemm_bf16t_bf16n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_4x4x1) {
  // Operands: A is row-major bf16, B is column-major bf16, C/D are
  // column-major fp32. Alignments correspond to 16-byte accesses.
  using ElementA = cutlass::bfloat16_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::bfloat16_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: fp32 accumulation, 256x256x64 tile on a 4x4x1
  // cluster, 2-SM ptr-array TMA warp-specialized mainloop and epilogue.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;
  using TileShape_MNK    = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  // Stage count is derived automatically after carving out the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha = 2.0, beta = 2.0
  EXPECT_TRUE(TestSmall<Gemm>(2.0, 2.0));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
323
test/unit/gemm/device/sm100_gemm_bf16_bf16_f32_tensor_op_f32.cu
Normal file
323
test/unit/gemm/device/sm100_gemm_bf16_bf16_f32_tensor_op_f32.cu
Normal file
@ -0,0 +1,323 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/// A Row B Col
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
|
||||
using ElementA = cutlass::bfloat16_t;
|
||||
using ElementB = cutlass::bfloat16_t;
|
||||
using ElementC = void;
|
||||
using ElementD = float;
|
||||
using ElementCompute = float;
|
||||
using ElementAccumulator = float;
|
||||
using GmemLayoutA = cutlass::layout::RowMajor;
|
||||
using GmemLayoutB = cutlass::layout::ColumnMajor;
|
||||
using GmemLayoutC = cutlass::layout::RowMajor;
|
||||
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
|
||||
using ClusterShape_MNK = Shape<_4,_4,_1>;
|
||||
using MmaTileShape_MNK = Shape<_256,_128,_128>;
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
|
||||
|
||||
//
|
||||
// Construct CollectiveEpilogue
|
||||
//
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, GmemLayoutC, 16,
|
||||
ElementD, GmemLayoutC, 16,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
//
|
||||
// Construct CollectiveMainloop
|
||||
//
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, GmemLayoutA, 8,
|
||||
ElementB, GmemLayoutB, 8,
|
||||
ElementAccumulator,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
/// A Col B Row
|
||||
TEST(SM100_Device_Gemm_f16n_f16t_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
|
||||
using ElementA = cutlass::bfloat16_t;
|
||||
using ElementB = cutlass::bfloat16_t;
|
||||
using ElementC = void;
|
||||
using ElementD = float;
|
||||
using ElementCompute = float;
|
||||
using ElementAccumulator = float;
|
||||
using GmemLayoutA = cutlass::layout::ColumnMajor;
|
||||
using GmemLayoutB = cutlass::layout::RowMajor;
|
||||
using GmemLayoutC = cutlass::layout::RowMajor;
|
||||
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
|
||||
using ClusterShape_MNK = Shape<_4,_4,_1>;
|
||||
using MmaTileShape_MNK = Shape<_256,_128,_128>;
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
|
||||
|
||||
//
|
||||
// Construct CollectiveEpilogue
|
||||
//
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, GmemLayoutC, 16,
|
||||
ElementD, GmemLayoutC, 16,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
//
|
||||
// Construct CollectiveMainloop
|
||||
//
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, GmemLayoutA, 8,
|
||||
ElementB, GmemLayoutB, 8,
|
||||
ElementAccumulator,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
/// A Row B Row
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
|
||||
using ElementA = cutlass::bfloat16_t;
|
||||
using ElementB = cutlass::bfloat16_t;
|
||||
using ElementC = void;
|
||||
using ElementD = float;
|
||||
using ElementCompute = float;
|
||||
using ElementAccumulator = float;
|
||||
using GmemLayoutA = cutlass::layout::RowMajor;
|
||||
using GmemLayoutB = cutlass::layout::RowMajor;
|
||||
using GmemLayoutC = cutlass::layout::RowMajor;
|
||||
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
|
||||
using ClusterShape_MNK = Shape<_4,_4,_1>;
|
||||
using MmaTileShape_MNK = Shape<_256,_128,_128>;
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
|
||||
|
||||
//
|
||||
// Construct CollectiveEpilogue
|
||||
//
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, GmemLayoutC, 16,
|
||||
ElementD, GmemLayoutC, 16,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
//
|
||||
// Construct CollectiveMainloop
|
||||
//
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, GmemLayoutA, 8,
|
||||
ElementB, GmemLayoutB, 8,
|
||||
ElementAccumulator,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
/// A Col B Col
|
||||
TEST(SM100_Device_Gemm_f16n_f16n_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
|
||||
using ElementA = cutlass::bfloat16_t;
|
||||
using ElementB = cutlass::bfloat16_t;
|
||||
using ElementC = void;
|
||||
using ElementD = float;
|
||||
using ElementCompute = float;
|
||||
using ElementAccumulator = float;
|
||||
using GmemLayoutA = cutlass::layout::ColumnMajor;
|
||||
using GmemLayoutB = cutlass::layout::ColumnMajor;
|
||||
using GmemLayoutC = cutlass::layout::RowMajor;
|
||||
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
|
||||
using ClusterShape_MNK = Shape<_4,_4,_1>;
|
||||
using MmaTileShape_MNK = Shape<_256,_128,_128>;
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
|
||||
|
||||
//
|
||||
// Construct CollectiveEpilogue
|
||||
//
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, GmemLayoutC, 16,
|
||||
ElementD, GmemLayoutC, 16,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
//
|
||||
// Construct CollectiveMainloop
|
||||
//
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, GmemLayoutA, 8,
|
||||
ElementB, GmemLayoutB, 8,
|
||||
ElementAccumulator,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_bf16t_bf16t_bf32_void_f32n_tensor_op, 128x256x64_1x2x1) {
  // Operands: A/B are row-major bf16, no C source operand, fp32 D
  // (column-major), fp32 accumulation.
  using ElementA = cutlass::bfloat16_t;
  using LayoutA  = cutlass::layout::RowMajor;
  using ElementB = cutlass::bfloat16_t;
  using LayoutB  = cutlass::layout::RowMajor;
  using ElementAccumulator = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  // 128x256x64 CTA-level tile on a 1x2x1 cluster; the MMA works on a
  // 128x128x64 tile (the per-CTA output tile after the cluster split).
  using MmaTileShape     = Shape<_128,_128,_64>;
  using TileShape_MNK    = Shape<_128,_256,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      void, LayoutC, 8,
      float, LayoutC, 8,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // BUG FIX: the mainloop previously hard-coded cutlass::half_t for both
  // operands, leaving the bf16 ElementA/ElementB declared above unused and
  // contradicting the bf16 element types in the test's name. Use the
  // declared aliases so the kernel actually runs on bfloat16 inputs.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 8,
      ElementB, LayoutB, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha = 1.0, beta = 0.0
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,364 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide Ptr-Array GEMM interface
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16n_f16n_tensor_op_1sm_f16_ptr_array, 64x128x64_1x1x1) {
  // NOTE(review): LayoutB below is ColumnMajor although the suite name marks
  // B as "t" (row-major) -- confirm the intended naming convention.
  // Operands: A is row-major half, B is column-major half, C/D are
  // column-major half. Alignments correspond to 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: fp16 accumulation, 64x128x64 tile on a single-CTA
  // cluster, 1-SM ptr-array TMA warp-specialized mainloop and epilogue.
  using ElementAccumulator = cutlass::half_t;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;
  using TileShape_MNK    = Shape<_64,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  // Stage count is derived automatically after carving out the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha = 1.0, beta = 2.0
  EXPECT_TRUE(TestSmall<Gemm>(1.0, 2.0));
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16n_f16n_tensor_op_1sm_f16_ptr_array, 128x128x64_1x2x1) {
  // Operands: A is row-major half, B is column-major half, C/D are
  // column-major half. Alignments correspond to 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: fp16 accumulation, 128x128x64 tile on a 1x2x1
  // cluster, 1-SM ptr-array TMA warp-specialized mainloop and epilogue.
  using ElementAccumulator = cutlass::half_t;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;
  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  // Stage count is derived automatically after carving out the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha = 1.0, beta = 2.0
  EXPECT_TRUE(TestSmall<Gemm>(1.0, 2.0));
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16n_f16n_tensor_op_1sm_f16_ptr_array, 128x64x64_1x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_128,_64,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutD, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA, AlignmentA,
|
||||
ElementB, LayoutB, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(3.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16n_f16n_tensor_op_2sm_f16_ptr_array, 256x128x64_2x1x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutD, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA, AlignmentA,
|
||||
ElementB, LayoutB, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 0.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16n_f16n_tensor_op_2sm_f16_ptr_array, 256x256x64_2x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_256,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutD, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA, AlignmentA,
|
||||
ElementB, LayoutB, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(2.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,606 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide Grouped GEMM interface
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_128,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_128,_64,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(3.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 0.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_256,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(2.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16n_f16t_f16t_tensor_op_1sm_f32_group, 64x128x64_1x1x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::half_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::half_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = cutlass::half_t; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::half_t; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_64,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16n_f16n_f16n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
  // Grouped GEMM on SM100 tensor cores: half_t operands, fp32 accumulation,
  // 1-SM ptr-array kernel schedule. All operands are column-major ("n").

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Derived shapes: divisor Shape<_1,_1,_1> matches the 1-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Layouts are passed as pointer types (LayoutX *) for grouped problems,
  // where each group carries its own stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(1.0, 2.0);  // alpha = 1, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16t_f16t_f16t_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
  // Grouped GEMM on SM100 tensor cores: half_t operands, fp32 accumulation,
  // 1-SM ptr-array kernel schedule. All operands are row-major ("t").

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Derived shapes: divisor Shape<_1,_1,_1> matches the 1-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Layouts are passed as pointer types (LayoutX *) for grouped problems,
  // where each group carries its own stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(3.0, 2.0);  // alpha = 3, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16t_f16t_f16n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
  // Grouped GEMM on SM100 tensor cores: half_t operands, fp32 accumulation,
  // 2-SM ptr-array kernel schedule. A/B row-major ("t"), C/D column-major ("n").

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  // Derived shapes: divisor Shape<_2,_1,_1> matches the 2-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  // Layouts are passed as pointer types (LayoutX *) for grouped problems,
  // where each group carries its own stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(1.0, 0.0);  // alpha = 1, beta = 0 (C unused)
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f16t_f16t_f16t_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
  // Grouped GEMM on SM100 tensor cores: half_t operands, fp32 accumulation,
  // 2-SM ptr-array kernel schedule.
  // NOTE(review): LayoutB is ColumnMajor here although the suite name encodes
  // "t" (row-major) for B; sibling tests use "n" for ColumnMajor B. Confirm
  // whether the name or the layout is intended.

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::RowMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::RowMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Derived shapes: divisor Shape<_2,_1,_1> matches the 2-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  // Layouts are passed as pointer types (LayoutX *) for grouped problems,
  // where each group carries its own stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(2.0, 2.0);  // alpha = 2, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
// ===== boundary: new file added by this commit (665 lines) — SM100 ptr-array GEMM tests =====
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide Ptr-Array GEMM interface
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
  // Ptr-array (batched) GEMM on SM100 tensor cores: half_t operands, fp32
  // accumulation, 1-SM kernel schedule. A row-major, B/C/D column-major.

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_64,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;
  // Derived shapes: divisor Shape<_1,_1,_1> matches the 1-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Unlike the grouped tests, layouts are plain (non-pointer) types: all
  // batch entries share one problem shape and stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      // Rank-4 problem shape — presumably (M,N,K,batch); confirm against the testbed.
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(1.0, 2.0);  // alpha = 1, beta = 2
  EXPECT_TRUE(result);
}
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
  // Ptr-array (batched) GEMM on SM100 tensor cores: half_t operands, fp32
  // accumulation, 1-SM kernel schedule. A row-major, B/C/D column-major.

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Derived shapes: divisor Shape<_1,_1,_1> matches the 1-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Plain (non-pointer) layouts: all batch entries share one problem
  // shape and stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      // Rank-4 problem shape — presumably (M,N,K,batch); confirm against the testbed.
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(1.0, 2.0);  // alpha = 1, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // Ptr-array (batched) GEMM on SM100 tensor cores: half_t operands, fp32
  // accumulation, 1-SM kernel schedule. A row-major, B/C/D column-major.

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Derived shapes: divisor Shape<_1,_1,_1> matches the 1-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Plain (non-pointer) layouts: all batch entries share one problem
  // shape and stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      // Rank-4 problem shape — presumably (M,N,K,batch); confirm against the testbed.
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(3.0, 2.0);  // alpha = 3, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // Ptr-array (batched) GEMM on SM100 tensor cores: half_t operands, fp32
  // accumulation, 2-SM kernel schedule. A row-major, B/C/D column-major.

  // Operand element types, layouts, and alignments (full 16-byte accesses).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementC = cutlass::half_t;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ElementD = cutlass::half_t;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  // Derived shapes: divisor Shape<_2,_1,_1> matches the 2-SM schedule below.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  // Plain (non-pointer) layouts: all batch entries share one problem
  // shape and stride.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      // Rank-4 problem shape — presumably (M,N,K,batch); confirm against the testbed.
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  bool result = TestSmall<Gemm>(1.0, 0.0);  // alpha = 1, beta = 0 (C unused)
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;                                        // Element type for A matrix operand
  using LayoutA  = cutlass::layout::RowMajor;                              // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Access granularity of A in elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = cutlass::half_t;                                        // Element type for B matrix operand
  using LayoutB  = cutlass::layout::ColumnMajor;                           // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Access granularity of B in elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = cutlass::half_t;                                        // Element type for C matrix operand
  using LayoutC  = cutlass::layout::ColumnMajor;                           // Layout type for C matrix operand
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Access granularity of C in elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = cutlass::half_t;                                        // Element type for D matrix operand
  using LayoutD  = cutlass::layout::ColumnMajor;                           // Layout type for D matrix operand
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Access granularity of D in elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag       = cutlass::arch::Sm100;            // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag

  using TileShape_MNK    = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (2SM atom spans 2 CTAs in M),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;         // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,  // use the aliases, consistent with the mainloop builder below
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,  // ptr-array (grouped) problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(2.0, 2.0);  // alpha = 2, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16n_f16t_f16t_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;                                        // Element type for A matrix operand
  using LayoutA  = cutlass::layout::ColumnMajor;                           // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Access granularity of A in elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = cutlass::half_t;                                        // Element type for B matrix operand
  using LayoutB  = cutlass::layout::RowMajor;                              // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Access granularity of B in elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = cutlass::half_t;                                        // Element type for C matrix operand
  using LayoutC  = cutlass::layout::RowMajor;                              // Layout type for C matrix operand
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Access granularity of C in elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = cutlass::half_t;                                        // Element type for D matrix operand
  using LayoutD  = cutlass::layout::RowMajor;                              // Layout type for D matrix operand
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Access granularity of D in elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag       = cutlass::arch::Sm100;            // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag

  using TileShape_MNK    = Shape<_64,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (1SM atom),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;         // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,  // use the aliases, consistent with the mainloop builder below
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,  // ptr-array (grouped) problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(1.0, 2.0);  // alpha = 1, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16n_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;                                        // Element type for A matrix operand
  using LayoutA  = cutlass::layout::ColumnMajor;                           // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Access granularity of A in elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = cutlass::half_t;                                        // Element type for B matrix operand
  using LayoutB  = cutlass::layout::ColumnMajor;                           // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Access granularity of B in elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = cutlass::half_t;                                        // Element type for C matrix operand
  using LayoutC  = cutlass::layout::ColumnMajor;                           // Layout type for C matrix operand
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Access granularity of C in elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = cutlass::half_t;                                        // Element type for D matrix operand
  using LayoutD  = cutlass::layout::ColumnMajor;                           // Layout type for D matrix operand
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Access granularity of D in elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag       = cutlass::arch::Sm100;            // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag

  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (1SM atom),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;         // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,  // use the aliases, consistent with the mainloop builder below
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,  // ptr-array (grouped) problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(1.0, 2.0);  // alpha = 1, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16t_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;                                        // Element type for A matrix operand
  using LayoutA  = cutlass::layout::RowMajor;                              // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Access granularity of A in elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = cutlass::half_t;                                        // Element type for B matrix operand
  using LayoutB  = cutlass::layout::RowMajor;                              // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Access granularity of B in elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = cutlass::half_t;                                        // Element type for C matrix operand
  using LayoutC  = cutlass::layout::RowMajor;                              // Layout type for C matrix operand
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Access granularity of C in elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = cutlass::half_t;                                        // Element type for D matrix operand
  using LayoutD  = cutlass::layout::RowMajor;                              // Layout type for D matrix operand
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Access granularity of D in elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag       = cutlass::arch::Sm100;            // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag

  using TileShape_MNK    = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (1SM atom),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;         // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,  // use the aliases, consistent with the mainloop builder below
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,  // ptr-array (grouped) problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(3.0, 2.0);  // alpha = 3, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;                                        // Element type for A matrix operand
  using LayoutA  = cutlass::layout::RowMajor;                              // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Access granularity of A in elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = cutlass::half_t;                                        // Element type for B matrix operand
  using LayoutB  = cutlass::layout::RowMajor;                              // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Access granularity of B in elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = cutlass::half_t;                                        // Element type for C matrix operand
  using LayoutC  = cutlass::layout::ColumnMajor;                           // Layout type for C matrix operand
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Access granularity of C in elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = cutlass::half_t;                                        // Element type for D matrix operand
  using LayoutD  = cutlass::layout::ColumnMajor;                           // Layout type for D matrix operand
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Access granularity of D in elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag       = cutlass::arch::Sm100;            // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag

  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (2SM atom spans 2 CTAs in M),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;         // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,  // use the aliases, consistent with the mainloop builder below
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,  // ptr-array (grouped) problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(1.0, 0.0);  // alpha = 1, beta = 0
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f16t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;                                        // Element type for A matrix operand
  using LayoutA  = cutlass::layout::RowMajor;                              // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Access granularity of A in elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = cutlass::half_t;                                        // Element type for B matrix operand
  // NOTE(review): suite name says "f16t" for B, but LayoutB is ColumnMajor ("n") — confirm intended layout/name.
  using LayoutB  = cutlass::layout::ColumnMajor;                           // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Access granularity of B in elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = cutlass::half_t;                                        // Element type for C matrix operand
  using LayoutC  = cutlass::layout::RowMajor;                              // Layout type for C matrix operand
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Access granularity of C in elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = cutlass::half_t;                                        // Element type for D matrix operand
  using LayoutD  = cutlass::layout::RowMajor;                              // Layout type for D matrix operand
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Access granularity of D in elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag       = cutlass::arch::Sm100;            // Minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag

  using TileShape_MNK    = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (2SM atom spans 2 CTAs in M),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;         // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,  // use the aliases, consistent with the mainloop builder below
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,  // ptr-array (grouped) problem shape
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  bool result = TestSmall<Gemm>(2.0, 2.0);  // alpha = 2, beta = 2
  EXPECT_TRUE(result);
}
|
||||
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,250 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface with stream-K scheduling
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 128x256x64_1x2x1) {
  // Operand/accumulator configuration. The aliases below are now actually used by the
  // builders (previously they were declared but the builders hardcoded the same types).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  using ElementAccumulator = float;
  using LayoutC  = cutlass::layout::ColumnMajor;

  using TileShape_MNK    = Shape<_128,_256,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (1SM atom),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      cutlass::half_t, LayoutC, 8,  // C: f16, 8-element (16-byte) alignment
      cutlass::half_t, LayoutC, 8,  // D: f16, 8-element (16-byte) alignment
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 8,
      ElementB, LayoutB, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      cutlass::gemm::StreamKScheduler  // stream-K tile scheduling under test
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Dropped unused alias `Testbed` (Testbed3x<...>) — it was never referenced.
  // alpha = 1, beta = 0; exact-equality check; K sweep {64, 1024, 2048} exercises
  // stream-K splitting across different K extents.
  bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(
      1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 256x128x64_2x1x1) {
  // Operand/accumulator configuration. The aliases below are now actually used by the
  // builders (previously they were declared but the builders hardcoded the same types).
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  using ElementAccumulator = float;
  using LayoutC  = cutlass::layout::ColumnMajor;

  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  // Shapes derived from the cluster: MMA-atom thread shape (1SM atom),
  // per-CTA output tile, and per-MMA tile.
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      cutlass::half_t, LayoutC, 8,  // C: f16, 8-element (16-byte) alignment
      cutlass::half_t, LayoutC, 8,  // D: f16, 8-element (16-byte) alignment
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 8,
      ElementB, LayoutB, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Carve the epilogue's shared-memory footprint out of the mainloop stage budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      cutlass::gemm::StreamKScheduler  // stream-K tile scheduling under test
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Dropped unused alias `Testbed` (Testbed3x<...>) — it was never referenced.
  // alpha = 1, beta = 0; exact-equality check; K sweep {64, 1024, 2048} exercises
  // stream-K splitting across different K extents.
  bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(
      1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
  EXPECT_TRUE(result);
}
|
||||
|
||||
// Stream-K scheduled GEMM: f16 row-major A and B, f32 accumulation,
// f16 column-major C/D, 256x256x64 tile on a 2x2x1 cluster with the
// 1-SM SM100 TMA warp-specialized kernel schedule.
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 256x256x64_2x2x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using TileShape_MNK = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Divisor Shape<_1,_1,_1> matches the 1-SM kernel schedule below: every CTA
  // in the cluster owns its own MMA atom.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,                  // accumulator / epilogue compute types
      cutlass::half_t, LayoutC, 8,   // C: f16, column-major, 8-element alignment
      cutlass::half_t, LayoutC, 8,   // D: f16, column-major, 8-element alignment
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::half_t, LayoutA, 8,
      cutlass::half_t, LayoutB, 8,
      float,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      cutlass::gemm::StreamKScheduler  // the scheduler under test
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
  // alpha=1, beta=0; exact-equality comparison over the given K extents.
  bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
  EXPECT_TRUE(result);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Stream-K scheduled GEMM: f16 row-major A, f16 column-major B, f32
// accumulation, f16 column-major C/D, 256x256x64 tile on a 2x4x1 cluster.
// NOTE(review): the test-case name previously said 256x128x64, but
// TileShape_MNK below is Shape<_256,_256,_64>; renamed to match the config.
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_gmma_f32_stream_k, 256x256x64_2x4x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using TileShape_MNK = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_4,_1>;
  // Divisor Shape<_1,_1,_1> matches the 1-SM kernel schedule below: every CTA
  // in the cluster owns its own MMA atom.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,                  // accumulator / epilogue compute types
      cutlass::half_t, LayoutC, 8,   // C: f16, column-major, 8-element alignment
      cutlass::half_t, LayoutC, 8,   // D: f16, column-major, 8-element alignment
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::half_t, LayoutA, 8,
      cutlass::half_t, LayoutB, 8,
      float,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      cutlass::gemm::StreamKScheduler  // the scheduler under test
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
  // alpha=1, beta=0; exact-equality comparison over the given K extents.
  bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(
      1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
  EXPECT_TRUE(result);
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
104
test/unit/gemm/device/sm100_gemm_f16_f16_f32_tensor_op_f32.cu
Normal file
104
test/unit/gemm/device/sm100_gemm_f16_f16_f32_tensor_op_f32.cu
Normal file
@ -0,0 +1,104 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// GEMM with a void C operand (no source tensor read in the epilogue):
// f16 row-major A and B, f32 accumulation, f16 column-major D,
// 128x256x64 tile on a 1x2x1 cluster, 1-SM SM100 kernel schedule.
TEST(SM100_Device_Gemm_f16t_f16t_f32_void_f16n_tensor_op, 128x256x64_1x2x1) {
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementAccumulator = float;
  using LayoutC = cutlass::layout::ColumnMajor;
  using TileShape_MNK = Shape<_128,_256,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Divisor Shape<_1,_1,_1> matches the 1-SM kernel schedule below.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      void, LayoutC, 8,              // C is void: epilogue has no source operand
      cutlass::half_t, LayoutC, 8,   // D: f16, column-major, 8-element alignment
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 8,
      ElementB, LayoutB, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  // Default tile scheduler (no stream-K) for this configuration.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha=1, beta=0 (beta must be 0 since C is void).
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,664 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide Ptr-Array GEMM interface
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
// Ptr-array (batched, per-group pointer) GEMM: 1-SM kernel, 64x128x64 tile,
// 1x1x1 cluster; f16 A (row) x f16 B (col) -> f32 C/D (col).
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;           // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
  // B matrix configuration
  using ElementB = cutlass::half_t;              // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
  // Core kernel configurations
  using ElementAccumulator = float;                   // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;               // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_64,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;
  // Divisor Shape<_1,_1,_1> corresponds to the 1-SM kernel/epilogue schedules below.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // ArrayProblemShape wraps the per-group problem sizes for ptr-array GEMM.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha=1, beta=2.
  bool result = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(result);
}
|
||||
// Ptr-array GEMM: 1-SM kernel, 128x128x64 tile, 1x2x1 cluster;
// f16 A (row) x f16 B (col) -> f32 C/D (col).
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;           // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
  // B matrix configuration
  using ElementB = cutlass::half_t;              // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
  // Core kernel configurations
  using ElementAccumulator = float;                   // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;               // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Divisor Shape<_1,_1,_1> corresponds to the 1-SM kernel/epilogue schedules below.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // ArrayProblemShape wraps the per-group problem sizes for ptr-array GEMM.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha=1, beta=2.
  bool result = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(result);
}
|
||||
|
||||
// Ptr-array GEMM: 1-SM kernel, 128x64x64 tile, 1x2x1 cluster;
// f16 A (row) x f16 B (col) -> f32 C/D (col).
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;           // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
  // B matrix configuration
  using ElementB = cutlass::half_t;              // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
  // Core kernel configurations
  using ElementAccumulator = float;                   // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;               // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  // Divisor Shape<_1,_1,_1> corresponds to the 1-SM kernel/epilogue schedules below.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // ArrayProblemShape wraps the per-group problem sizes for ptr-array GEMM.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha=3, beta=2.
  bool result = TestSmall<Gemm>(3.0, 2.0);
  EXPECT_TRUE(result);
}
|
||||
|
||||
// Ptr-array GEMM: 2-SM kernel, 256x128x64 tile, 2x1x1 cluster;
// f16 A (row) x f16 B (col) -> f32 C/D (col).
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;           // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
  // B matrix configuration
  using ElementB = cutlass::half_t;              // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
  // Core kernel configurations
  using ElementAccumulator = float;                   // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;               // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  // Divisor Shape<_2,_1,_1> corresponds to the 2-SM kernel/epilogue schedules below.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // ArrayProblemShape wraps the per-group problem sizes for ptr-array GEMM.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha=1, beta=0.
  bool result = TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(result);
}
|
||||
|
||||
// Ptr-array GEMM: 2-SM kernel, 256x256x64 tile, 2x2x1 cluster;
// f16 A (row) x f16 B (col) -> f32 C/D (row).
TEST(SM100_Device_Gemm_f16t_f16n_f32t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
  // A matrix configuration
  using ElementA = cutlass::half_t;           // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
  // B matrix configuration
  using ElementB = cutlass::half_t;              // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
  // C matrix configuration
  using ElementC = float;                     // Element type for C matrix operands
  using LayoutC = cutlass::layout::RowMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
  // D matrix configuration
  using ElementD = float;                     // Element type for D matrix operands
  using LayoutD = cutlass::layout::RowMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
  // Core kernel configurations
  using ElementAccumulator = float;                   // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;               // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  // Divisor Shape<_2,_1,_1> corresponds to the 2-SM kernel/epilogue schedules below.
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  // Per-CTA output tile and per-MMA tile derived from the cluster decomposition.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;       // Epilogue to launch
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Reserve the epilogue's shared-memory footprint when auto-sizing stages.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;
  // ArrayProblemShape wraps the per-group problem sizes for ptr-array GEMM.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha=2, beta=2.
  bool result = TestSmall<Gemm>(2.0, 2.0);
  EXPECT_TRUE(result);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
  // Operand A: f16, row-major, alignment sized for 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f16, row-major.
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_64,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 1SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16n_f16t_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
  // Operand A: f16, column-major, alignment sized for 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f16, row-major.
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 1SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16n_f16t_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // Operand A: f16, column-major, alignment sized for 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f16, row-major.
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 1SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(3.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // Operand A: f16, row-major, alignment sized for 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f16, row-major.
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::RowMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 2SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f16n_f16n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
  // Operand A: f16, column-major, alignment sized for 16-byte accesses.
  using ElementA = cutlass::half_t;
  using LayoutA  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f16, column-major.
  using ElementB = cutlass::half_t;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_256,_256,_64>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 2SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(2.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,606 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide Grouped GEMM interface
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
|
||||
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
  // Operand A: f32, row-major, alignment sized for 16-byte accesses.
  using ElementA = float;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f32, column-major.
  using ElementB = float;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_128,_128,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 1SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Grouped GEMM: layouts are passed as pointer types (per-group layouts).
  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
  // Operand A: f32, row-major, alignment sized for 16-byte accesses.
  using ElementA = float;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f32, column-major.
  using ElementB = float;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_128,_64,_64>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 1SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;

  // Grouped GEMM: layouts are passed as pointer types (per-group layouts).
  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(3.0, 2.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32t_f32n_f32n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
  // Operand A: f32, row-major, alignment sized for 16-byte accesses.
  using ElementA = float;
  using LayoutA  = cutlass::layout::RowMajor;
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  // Operand B: f32, column-major.
  using ElementB = float;
  using LayoutB  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  // Source C and destination D: f32, column-major.
  using ElementC = float;
  using LayoutC  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  using ElementD = float;
  using LayoutD  = cutlass::layout::ColumnMajor;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Kernel configuration: f32 accumulation on SM100 tensor-op path.
  using ElementAccumulator = float;
  using ArchTag       = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Tile / cluster geometry; derived shapes come from elementwise shape_div.
  using TileShape_MNK    = Shape<_256,_128,_64>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;
  using AtomThrShape   = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  using MmaTileShape   = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

  // 2SM ptr-array TMA warp-specialized schedules.
  using KernelSchedule   = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  // Grouped GEMM: layouts are passed as pointer types (per-group layouts).
  // Epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC *, AlignmentC,
      ElementD, LayoutD *, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA *, AlignmentA,
      ElementB, LayoutB *, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  bool passed = TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32t_f32n_f32t_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_256,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(2.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32t_f32t_f32n_tensor_op_1sm_f32_group, 64x128x64_1x1x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_64,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32n_f32n_f32n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_128,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32n_f32t_f32n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_128,_64,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(3.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32t_f32t_f32n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 0.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_f32n_f32n_f32n_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_256,_256,_64>;
|
||||
using ClusterShape_MNK = Shape<_2,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(2.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,667 @@
|
||||
/***************************************************************************************************
 * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Tests for device-wide Ptr-Array GEMM interface
*/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
using namespace cute;
|
||||
|
||||
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_64,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutD, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA, AlignmentA,
|
||||
ElementB, LayoutB, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 1.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
|
||||
// A matrix configuration
|
||||
using ElementA = float; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
// B matrix configuration
|
||||
using ElementB = float; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
// C matrix configuration
|
||||
using ElementC = float; // Element type for C matrix operands
|
||||
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
// D matrix configuration
|
||||
using ElementD = float; // Element type for D matrix operands
|
||||
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using TileShape_MNK = Shape<_128,_128,_64>;
|
||||
using ClusterShape_MNK = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
|
||||
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
|
||||
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutD, AlignmentD,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA, AlignmentA,
|
||||
ElementB, LayoutB, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
bool result = TestSmall<Gemm>(1.0, 2.0);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
// F32 ptr-array GEMM on SM100 tensor ops: A row-major (t) x B column-major (n)
// -> C/D column-major (n); 1-SM kernel schedule, 128x64x64 tile, 1x2x1 cluster.
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // A matrix configuration
  using ElementA = float;                     // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = float;                        // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                  // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_128,_64,_64>;             // Overall M x N x K tile
  using ClusterShape_MNK = Shape<_1,_2,_1>;              // Threadblock cluster shape
  // CTA group cooperating on one MMA atom: ClusterShape / (1,1,1).
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile handed to the epilogue builder: TileShape / ClusterShape.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  // MMA tile handed to the mainloop builder: TileShape / AtomThrShape.
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Automatic stage count, after carving the epilogue's smem out of the budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Arguments are presumably (alpha, beta) -- defined in the ptr-array testbed header; confirm there.
  bool result = TestSmall<Gemm>(3.0, 2.0);
  EXPECT_TRUE(result);
}
|
||||
// F32 ptr-array GEMM on SM100 tensor ops: A row-major (t) x B column-major (n)
// -> C/D column-major (n); 2-SM kernel schedule, 256x128x64 tile, 2x1x1 cluster.
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // A matrix configuration
  using ElementA = float;                     // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = float;                        // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                  // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_256,_128,_64>;            // Overall M x N x K tile
  using ClusterShape_MNK = Shape<_2,_1,_1>;              // Threadblock cluster shape
  // CTA group cooperating on one MMA atom: ClusterShape / (2,1,1).
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  // Per-CTA output tile handed to the epilogue builder: TileShape / ClusterShape.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  // MMA tile handed to the mainloop builder: TileShape / AtomThrShape.
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Automatic stage count, after carving the epilogue's smem out of the budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Arguments are presumably (alpha, beta) -- defined in the ptr-array testbed header; confirm there.
  bool result = TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(result);
}
||||
// F32 ptr-array GEMM on SM100 tensor ops: A row-major (t) x B column-major (n)
// -> C/D row-major (t); 2-SM kernel schedule, 256x256x64 tile, 2x2x1 cluster.
TEST(SM100_Device_Gemm_f32t_f32n_f32t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
  // Operand/output element types and layouts.
  using ElementA = float;
  using ElementB = float;
  using ElementC = float;
  using ElementD = float;
  using ElementAccumulator = float;  // internal accumulation type
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;
  using LayoutD = cutlass::layout::RowMajor;

  // 128-bit (16-byte) access granularity, expressed in elements of each operand.
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ArchTag = cutlass::arch::Sm100;                  // minimum SM for this feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;
  using ClusterTileShape = Shape<_256,_256,_64>;         // overall M x N x K tile
  using ClusterShape = Shape<_2,_2,_1>;                  // threadblock cluster shape
  // CTA group cooperating on one MMA atom (ClusterShape / (2,1,1)), the
  // per-CTA epilogue tile, and the tile handed to the mainloop builder.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
  using CtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      CtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule>::CollectiveOp;

  // Stage count is chosen automatically after reserving the epilogue's smem.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule>::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  EXPECT_TRUE(TestSmall<Gemm>(2.0, 2.0));
}
||||
// F32 ptr-array GEMM on SM100 tensor ops: A row-major (t) x B row-major (t)
// -> C/D column-major (n); 1-SM kernel schedule, 64x128x64 tile, 1x1x1 cluster.
TEST(SM100_Device_Gemm_f32t_f32t_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
  // A matrix configuration
  using ElementA = float;                     // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = float;                     // Element type for B matrix operand
  using LayoutB = cutlass::layout::RowMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                  // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_64,_128,_64>;             // Overall M x N x K tile
  using ClusterShape_MNK = Shape<_1,_1,_1>;              // Threadblock cluster shape
  // CTA group cooperating on one MMA atom: ClusterShape / (1,1,1).
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile handed to the epilogue builder: TileShape / ClusterShape.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  // MMA tile handed to the mainloop builder: TileShape / AtomThrShape.
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Automatic stage count, after carving the epilogue's smem out of the budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Arguments are presumably (alpha, beta) -- defined in the ptr-array testbed header; confirm there.
  bool result = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(result);
}
||||
// F32 ptr-array GEMM on SM100 tensor ops: A column-major (n) x B column-major (n)
// -> C/D column-major (n); 1-SM kernel schedule, 128x128x64 tile, 1x2x1 cluster.
TEST(SM100_Device_Gemm_f32n_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
  // A matrix configuration
  using ElementA = float;                        // Element type for A matrix operand
  using LayoutA = cutlass::layout::ColumnMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = float;                        // Element type for B matrix operand
  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                  // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_128,_128,_64>;            // Overall M x N x K tile
  using ClusterShape_MNK = Shape<_1,_2,_1>;              // Threadblock cluster shape
  // CTA group cooperating on one MMA atom: ClusterShape / (1,1,1).
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile handed to the epilogue builder: TileShape / ClusterShape.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  // MMA tile handed to the mainloop builder: TileShape / AtomThrShape.
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Automatic stage count, after carving the epilogue's smem out of the budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Arguments are presumably (alpha, beta) -- defined in the ptr-array testbed header; confirm there.
  bool result = TestSmall<Gemm>(1.0, 2.0);
  EXPECT_TRUE(result);
}
||||
// F32 ptr-array GEMM on SM100 tensor ops: A column-major (n) x B row-major (t)
// -> C/D column-major (n); 1-SM kernel schedule, 128x64x64 tile, 1x2x1 cluster.
TEST(SM100_Device_Gemm_f32n_f32t_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
  // A matrix configuration
  using ElementA = float;                        // Element type for A matrix operand
  using LayoutA = cutlass::layout::ColumnMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = float;                     // Element type for B matrix operand
  using LayoutB = cutlass::layout::RowMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                  // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_128,_64,_64>;             // Overall M x N x K tile
  using ClusterShape_MNK = Shape<_1,_2,_1>;              // Threadblock cluster shape
  // CTA group cooperating on one MMA atom: ClusterShape / (1,1,1).
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
  // Per-CTA output tile handed to the epilogue builder: TileShape / ClusterShape.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  // MMA tile handed to the mainloop builder: TileShape / AtomThrShape.
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Automatic stage count, after carving the epilogue's smem out of the budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Arguments are presumably (alpha, beta) -- defined in the ptr-array testbed header; confirm there.
  bool result = TestSmall<Gemm>(3.0, 2.0);
  EXPECT_TRUE(result);
}
||||
// F32 ptr-array GEMM on SM100 tensor ops: A row-major (t) x B row-major (t)
// -> C/D column-major (n); 2-SM kernel schedule, 256x128x64 tile, 2x1x1 cluster.
TEST(SM100_Device_Gemm_f32t_f32t_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
  // A matrix configuration
  using ElementA = float;                     // Element type for A matrix operand
  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

  // B matrix configuration
  using ElementB = float;                     // Element type for B matrix operand
  using LayoutB = cutlass::layout::RowMajor;  // Layout type for B matrix operand
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

  // C matrix configuration
  using ElementC = float;                        // Element type for C matrix operands
  using LayoutC = cutlass::layout::ColumnMajor;  // Layout type for C matrix operands
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)

  // D matrix configuration
  using ElementD = float;                        // Element type for D matrix operands
  using LayoutD = cutlass::layout::ColumnMajor;  // Layout type for D matrix operands
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)

  // Core kernel configurations
  using ElementAccumulator = float;                      // Element type for internal accumulation
  using ArchTag = cutlass::arch::Sm100;                  // Tag indicating the minimum SM that supports the intended feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
  using TileShape_MNK = Shape<_256,_128,_64>;            // Overall M x N x K tile
  using ClusterShape_MNK = Shape<_2,_1,_1>;              // Threadblock cluster shape
  // CTA group cooperating on one MMA atom: ClusterShape / (2,1,1).
  using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
  // Per-CTA output tile handed to the epilogue builder: TileShape / ClusterShape.
  using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
  // MMA tile handed to the mainloop builder: TileShape / AtomThrShape.
  using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;  // Kernel to launch
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;       // Epilogue to launch

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape_MNK,
      // Automatic stage count, after carving the epilogue's smem out of the budget.
      cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule
    >::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // Arguments are presumably (alpha, beta) -- defined in the ptr-array testbed header; confirm there.
  bool result = TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(result);
}
||||
// F32 ptr-array GEMM on SM100 tensor ops: A column-major (n) x B column-major (n)
// -> C/D column-major (n); 2-SM kernel schedule, 256x256x64 tile, 2x2x1 cluster.
TEST(SM100_Device_Gemm_f32n_f32n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
  // Operand/output element types and layouts.
  using ElementA = float;
  using ElementB = float;
  using ElementC = float;
  using ElementD = float;
  using ElementAccumulator = float;  // internal accumulation type
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using LayoutD = cutlass::layout::ColumnMajor;

  // 128-bit (16-byte) access granularity, expressed in elements of each operand.
  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Core kernel configuration.
  using ArchTag = cutlass::arch::Sm100;                  // minimum SM for this feature
  using OperatorClass = cutlass::arch::OpClassTensorOp;
  using ClusterTileShape = Shape<_256,_256,_64>;         // overall M x N x K tile
  using ClusterShape = Shape<_2,_2,_1>;                  // threadblock cluster shape
  // CTA group cooperating on one MMA atom (ClusterShape / (2,1,1)), the
  // per-CTA epilogue tile, and the tile handed to the mainloop builder.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
  using CtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      CtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementAccumulator,
      ElementC, LayoutC, AlignmentC,
      ElementD, LayoutD, AlignmentD,
      EpilogueSchedule>::CollectiveOp;

  // Stage count is chosen automatically after reserving the epilogue's smem.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      ArchTag, OperatorClass,
      ElementA, LayoutA, AlignmentA,
      ElementB, LayoutB, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      KernelSchedule>::CollectiveOp;

  // Ptr-array problem shape: an array of rank-4 integer problem shapes.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using namespace test::gemm::device;
  EXPECT_TRUE(TestSmall<Gemm>(2.0, 2.0));
}
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,327 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// Grouped block-scaled GEMM: mxf4 inputs (e2m1 data + ue8m0 block scale
// factors), fp32 accumulate/output, 1-SM UMMA, TMA warp-specialized
// ptr-array (grouped) kernel schedule.
// NOTE(review): sibling tests in this file use the "SM100Only_" suite prefix
// and beta = 0.5; confirm the "SM100_" prefix and beta = 0.0 here are
// intentional.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 512x256x256_4x2x1) {
  // A is row-major (T), B column-major (N), C/D column-major (N).
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  // fp4 operand data paired with a ue8m0 per-block scale factor.
  using ElementA = cutlass::float_e2m1_t;
  using ElementB = cutlass::float_e2m1_t;
  using ElementC = float;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementCompute = float;
  using ElementSF = cutlass::float_ue8m0_t;
  using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
  using MmaTypePairB = cute::tuple<ElementB, ElementSF>;

  // Cluster-wide tile split over a 4x2x1 cluster. With a 1-SM MMA atom the
  // atom-thread shape equals the cluster shape, so MMA tile == per-CTA tile.
  using ClusterTileShape = cute::Shape<_512,_256,_256>;
  using ClusterShape = Shape<_4,_2,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
  // Epilogue collective. Alignment is written as the literal 4
  // (== 16 / sizeof(float)) for consistency with the sibling tests here.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    OutputCtaShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementC, LayoutC *, 4,
    ElementD, LayoutC *, 4,
    EpilogueSchedule
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the pipeline stage
  // count is derived automatically after carving out the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    MmaTypePairA, LayoutA *, 32,
    MmaTypePairB, LayoutB *, 32,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    MainloopSchedule
  >::CollectiveOp;

  // Grouped GEMM: one (M,N,K) problem shape per group.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  // alpha = 1.0, beta = 0.0 (no source accumulation in this variant).
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Grouped block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0 block scale
// factors), fp32 accumulation and output, 1-SM UMMA, TMA warp-specialized
// ptr-array (grouped) kernel schedule.
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x384x256_2x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 2x2x1 cluster.
  // With a 1-SM MMA atom the atom-thread shape equals the cluster shape,
  // so the MMA tile coincides with the per-CTA output tile.
  using ClusterTile = Shape<_256,_384,_256>;
  using Cluster     = Shape<_2,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_1,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: grouped (pointer-array) TMA warp-specialized, 1 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC *, 4,
    OutType, MajorC *, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA *, 32,
    ScaledB, MajorB *, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100
  >::CollectiveOp;

  // Grouped GEMM: one (M,N,K) problem shape per group.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Grouped block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0 block scale
// factors), fp32 accumulation and output, 1-SM UMMA, TMA warp-specialized
// ptr-array (grouped) kernel schedule. Wider N tile than the sibling test.
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x512x256_2x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 2x2x1 cluster.
  // With a 1-SM MMA atom the atom-thread shape equals the cluster shape,
  // so the MMA tile coincides with the per-CTA output tile.
  using ClusterTile = Shape<_256,_512,_256>;
  using Cluster     = Shape<_2,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_1,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: grouped (pointer-array) TMA warp-specialized, 1 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC *, 4,
    OutType, MajorC *, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA *, 32,
    ScaledB, MajorB *, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100
  >::CollectiveOp;

  // Grouped GEMM: one (M,N,K) problem shape per group.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Grouped block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0 block scale
// factors), fp32 accumulation and output, 2-SM UMMA, TMA warp-specialized
// ptr-array (grouped) kernel schedule.
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 256x256x256_2x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 2x2x1 cluster.
  // The 2-SM MMA atom pairs CTAs along M, so the atom-thread shape is
  // cluster / (2,1,1) and the MMA tile spans two CTAs' tiles along M.
  using ClusterTile = Shape<_256,_256,_256>;
  using Cluster     = Shape<_2,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_2,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: grouped (pointer-array) TMA warp-specialized, 2 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC *, 4,
    OutType, MajorC *, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA *, 32,
    ScaledB, MajorB *, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100
  >::CollectiveOp;

  // Grouped GEMM: one (M,N,K) problem shape per group.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Grouped block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0 block scale
// factors), fp32 accumulation and output, 2-SM UMMA, TMA warp-specialized
// ptr-array (grouped) kernel schedule, large 4x4x1 cluster.
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 512x768x256_4x4x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 4x4x1 cluster.
  // The 2-SM MMA atom pairs CTAs along M, so the atom-thread shape is
  // cluster / (2,1,1) and the MMA tile spans two CTAs' tiles along M.
  using ClusterTile = Shape<_512,_768,_256>;
  using Cluster     = Shape<_4,_4,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_2,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: grouped (pointer-array) TMA warp-specialized, 2 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC *, 4,
    OutType, MajorC *, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA *, 32,
    ScaledB, MajorB *, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100
  >::CollectiveOp;

  // Grouped GEMM: one (M,N,K) problem shape per group.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,327 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// Ptr-array (batched) block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0
// block scale factors), fp32 accumulation and output, 1-SM UMMA, TMA
// warp-specialized ptr-array kernel schedule.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 512x256x256_4x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N); layouts are
  // shared across the batch, so no per-group layout pointer is used here.
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 4x2x1 cluster.
  // With a 1-SM MMA atom the atom-thread shape equals the cluster shape,
  // so the MMA tile coincides with the per-CTA output tile.
  using ClusterTile = Shape<_512,_256,_256>;
  using Cluster     = Shape<_4,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_1,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: pointer-array TMA warp-specialized, 1 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC, 4,
    OutType, MajorC, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA, 32,
    ScaledB, MajorB, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100
  >::CollectiveOp;

  // Ptr-array GEMM: a single (M,N,K,L) problem shape for the whole array.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Ptr-array (batched) block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0
// block scale factors), fp32 accumulation and output, 1-SM UMMA, TMA
// warp-specialized ptr-array kernel schedule.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x384x256_2x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 2x2x1 cluster.
  // With a 1-SM MMA atom the atom-thread shape equals the cluster shape,
  // so the MMA tile coincides with the per-CTA output tile.
  using ClusterTile = Shape<_256,_384,_256>;
  using Cluster     = Shape<_2,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_1,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: pointer-array TMA warp-specialized, 1 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC, 4,
    OutType, MajorC, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA, 32,
    ScaledB, MajorB, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100
  >::CollectiveOp;

  // Ptr-array GEMM: a single (M,N,K,L) problem shape for the whole array.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Ptr-array (batched) block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0
// block scale factors), fp32 accumulation and output, 1-SM UMMA, TMA
// warp-specialized ptr-array kernel schedule. Wider N tile variant.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_2x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 2x2x1 cluster.
  // With a 1-SM MMA atom the atom-thread shape equals the cluster shape,
  // so the MMA tile coincides with the per-CTA output tile.
  using ClusterTile = Shape<_256,_512,_256>;
  using Cluster     = Shape<_2,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_1,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: pointer-array TMA warp-specialized, 1 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC, 4,
    OutType, MajorC, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA, 32,
    ScaledB, MajorB, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100
  >::CollectiveOp;

  // Ptr-array GEMM: a single (M,N,K,L) problem shape for the whole array.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Ptr-array (batched) block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0
// block scale factors), fp32 accumulation and output, 2-SM UMMA, TMA
// warp-specialized ptr-array kernel schedule.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_2x2x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 2x2x1 cluster.
  // The 2-SM MMA atom pairs CTAs along M, so the atom-thread shape is
  // cluster / (2,1,1) and the MMA tile spans two CTAs' tiles along M.
  using ClusterTile = Shape<_256,_256,_256>;
  using Cluster     = Shape<_2,_2,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_2,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: pointer-array TMA warp-specialized, 2 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC, 4,
    OutType, MajorC, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA, 32,
    ScaledB, MajorB, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100
  >::CollectiveOp;

  // Ptr-array GEMM: a single (M,N,K,L) problem shape for the whole array.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
// Ptr-array (batched) block-scaled GEMM test: mxf4 inputs (e2m1 data + ue8m0
// block scale factors), fp32 accumulation and output, 2-SM UMMA, TMA
// warp-specialized ptr-array kernel schedule, large 4x4x1 cluster.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_4x4x1) {
  // Value types: A/B carry fp4 data plus a ue8m0 scale factor per block;
  // C, D, the accumulator, and the epilogue compute type are all fp32.
  using DataType  = cutlass::float_e2m1_t;
  using ScaleType = cutlass::float_ue8m0_t;
  using OutType   = float;
  using ScaledA   = cute::tuple<DataType, ScaleType>;
  using ScaledB   = cute::tuple<DataType, ScaleType>;

  // A row-major (T), B column-major (N), C/D column-major (N).
  using MajorA = cutlass::layout::RowMajor;
  using MajorB = cutlass::layout::ColumnMajor;
  using MajorC = cutlass::layout::ColumnMajor;

  // Tiling: the cluster-wide tile is distributed over a 4x4x1 cluster.
  // The 2-SM MMA atom pairs CTAs along M, so the atom-thread shape is
  // cluster / (2,1,1) and the MMA tile spans two CTAs' tiles along M.
  using ClusterTile = Shape<_512,_768,_256>;
  using Cluster     = Shape<_4,_4,_1>;
  using AtomThr     = decltype(shape_div(Cluster{}, Shape<_2,_1,_1>{}));
  using CtaTile     = decltype(shape_div(ClusterTile{}, Cluster{}));
  using MmaTile     = decltype(shape_div(ClusterTile{}, AtomThr{}));

  // Epilogue collective: pointer-array TMA warp-specialized, 2 SM.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    CtaTile, Cluster,
    cutlass::epilogue::collective::EpilogueTileAuto,
    OutType, OutType,
    OutType, MajorC, 4,
    OutType, MajorC, 4,
    cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
  >::CollectiveOp;

  // Mainloop collective: block-scaled tensor-op path; the stage count is
  // derived automatically after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
    ScaledA, MajorA, 32,
    ScaledB, MajorB, 32,
    OutType,
    MmaTile, Cluster,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100
  >::CollectiveOp;

  // Ptr-array GEMM: a single (M,N,K,L) problem shape for the whole array.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Run the small problem sweep with alpha = 1.0, beta = 0.5.
  bool passed = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(passed);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,156 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// Runtime-datatype GEMM test: the operand encoding (here E2M1) is selected at
// run time via a type-erased fp4 operand type plus a UMMA format argument.
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32t_tensorop_2sm_f32_runtime_datatype, 512x512x128_4x4x1) {
  // Epilogue collective: per-CTA 128x128x128 tile on a 4x4x1 cluster, fp32
  // C/D (row-major, alignment 4) with a plain linear-combination fusion.
  // NOTE(review): the epilogue uses the 1-SM schedule while the mainloop
  // below uses the 2-SM kernel schedule -- confirm this pairing is the
  // intended configuration.
  using CollectiveEpilogue =
    typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cute::Shape<cute::_128, cute::_128, cute::_128>,
      cute::Shape<cute::_4,cute::_4,cute::_1>,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      float, cutlass::layout::RowMajor, 4,
      float, cutlass::layout::RowMajor, 4,
      cutlass::epilogue::TmaWarpSpecialized1Sm,

      cutlass::epilogue::fusion::LinearCombination<
        float,
        float,
        float,
        float
      >

    >::CollectiveOp;

  // Mainloop collective: type-erased dynamic fp4 operands (A row-major, B
  // column-major, alignment 128 elements), fp32 accumulation, 256x128x128
  // MMA tile, 2-SM TMA warp-specialized kernel schedule. The stage count is
  // derived after carving out the epilogue's shared-memory footprint.
  using CollectiveMainloop =
    typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::type_erased_dynamic_float4_t, cutlass::layout::RowMajor, 128,
      cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
      float,
      cute::Shape<cute::_256, cute::_128, cute::_128>,
      cute::Shape<cute::_4,cute::_4,cute::_1>,
      cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
    >::CollectiveOp;

  // Plain (non-grouped) GEMM with a batched (M,N,K,L) problem shape.
  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cute::Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      void>;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Both A and B are interpreted as E2M1 at run time.
  auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E2M1, cute::UMMA::MXF8F6F4Format::E2M1);
  EXPECT_TRUE(pass);

}
|
||||
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cute::Shape<cute::_128, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_2,cute::_2,cute::_1>,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
float, float,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
cutlass::epilogue::TmaWarpSpecialized1Sm,
|
||||
|
||||
cutlass::epilogue::fusion::LinearCombination<
|
||||
float,
|
||||
float,
|
||||
float,
|
||||
float
|
||||
>
|
||||
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::type_erased_dynamic_float4_t, cutlass::layout::RowMajor, 128,
|
||||
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
|
||||
float,
|
||||
cute::Shape<cute::_128, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_2,cute::_2,cute::_1>,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cute::Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue,
|
||||
void>;
|
||||
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E2M1, cute::UMMA::MXF8F6F4Format::E2M1);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,486 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 128x128x256_1x1x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_128,_128,_256>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_2x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_256,_512,_256>;
|
||||
using ClusterShape = Shape<_2,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 512x768x256_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_512,_768,_256>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 512x1024x256_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_2x2x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_256,_256,_256>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x512x256_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_512,_512,_256>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_512,_768,_256>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x1024x256_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e2m3_t;
|
||||
using ElementB = cutlass::float_e2m3_t;
|
||||
using ElementC = float;
|
||||
using ElementD = float;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ElementSF = cutlass::float_ue8m0_t;
|
||||
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
|
||||
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
|
||||
|
||||
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, 4,
|
||||
ElementD, LayoutC, 4,
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
|
||||
MmaTypePairA, LayoutA, 128,
|
||||
MmaTypePairB, LayoutB, 128,
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,156 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100_Device_Gemm_e3m2t_e2m3n_f32t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cute::Shape<cute::_128, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_2,cute::_2,cute::_1>,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
float, float,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
cutlass::epilogue::TmaWarpSpecialized1Sm,
|
||||
|
||||
cutlass::epilogue::fusion::LinearCombination<
|
||||
float,
|
||||
float,
|
||||
float,
|
||||
float
|
||||
>
|
||||
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::type_erased_dynamic_float6_t, cutlass::layout::RowMajor, 128,
|
||||
cutlass::type_erased_dynamic_float6_t, cutlass::layout::ColumnMajor, 128,
|
||||
float,
|
||||
cute::Shape<cute::_128, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_2,cute::_2,cute::_1>,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cute::Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue,
|
||||
void>;
|
||||
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E3M2, cute::UMMA::MXF8F6F4Format::E2M3);
|
||||
EXPECT_TRUE(pass);
|
||||
|
||||
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e3m2t_e2m3n_f32t_tensorop_1sm_f32_runtime_datatype, 512x512x128_4x4x1) {
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cute::Shape<cute::_128, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_4,cute::_4,cute::_1>,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
float, float,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
cutlass::epilogue::TmaWarpSpecialized1Sm,
|
||||
|
||||
cutlass::epilogue::fusion::LinearCombination<
|
||||
float,
|
||||
float,
|
||||
float,
|
||||
float
|
||||
>
|
||||
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::type_erased_dynamic_float6_t, cutlass::layout::RowMajor, 128,
|
||||
cutlass::type_erased_dynamic_float6_t, cutlass::layout::ColumnMajor, 128,
|
||||
float,
|
||||
cute::Shape<cute::_128, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_4,cute::_4,cute::_1>,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cute::Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue,
|
||||
void>;
|
||||
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E3M2, cute::UMMA::MXF8F6F4Format::E2M3);
|
||||
EXPECT_TRUE(pass);
|
||||
|
||||
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,109 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e2m1n_f32t_tensorop_2sm_f32_runtime_datatype, 256x128x128_2x2x1) {
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cute::Shape<cute::_256, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_2,cute::_1,cute::_1>,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
float, float,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
float, cutlass::layout::RowMajor, 4,
|
||||
cutlass::epilogue::TmaWarpSpecialized2Sm,
|
||||
|
||||
cutlass::epilogue::fusion::LinearCombination<
|
||||
float,
|
||||
float,
|
||||
float,
|
||||
float
|
||||
>
|
||||
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
|
||||
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
|
||||
float,
|
||||
cute::Shape<cute::_256, cute::_128, cute::_128>,
|
||||
cute::Shape<cute::_2,cute::_1,cute::_1>,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
|
||||
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cute::Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue,
|
||||
void>;
|
||||
|
||||
using namespace test::gemm::device;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E2M1);
|
||||
EXPECT_TRUE(pass);
|
||||
|
||||
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,504 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide Grouped GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 256x128x128_2x1x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_group, 128x128x128_1x1x1) {
|
||||
using LayoutA = cutlass::layout::ColumnMajor;
|
||||
using LayoutB = cutlass::layout::RowMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x1) {
|
||||
using LayoutA = cutlass::layout::ColumnMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_group, 256x128x128_2x1x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::RowMajor;
|
||||
using LayoutC = cutlass::layout::ColumnMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1_silu) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, 16 / sizeof(ElementC),
|
||||
ElementD, LayoutC *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule,
|
||||
cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::SiLu, ElementD, ElementAccumulator>
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(2.0, 0.5);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1_voidC_silu) {
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutD = cutlass::layout::RowMajor;
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementD = cutlass::float_e4m3_t;
|
||||
using ElementAccumulator = float;
|
||||
using ElementCompute = float;
|
||||
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
|
||||
using ClusterShape = Shape<_4,_4,_1>;
|
||||
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
|
||||
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
|
||||
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
|
||||
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
OutputCtaShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
void, LayoutD *, 16 / sizeof(ElementD),
|
||||
ElementD, LayoutD *, 16 / sizeof(ElementD),
|
||||
EpilogueSchedule,
|
||||
cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::SiLu, ElementD, ElementAccumulator>
|
||||
>::CollectiveOp;
|
||||
|
||||
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, LayoutA *, 16 / sizeof(ElementA),
|
||||
ElementB, LayoutB *, 16 / sizeof(ElementB),
|
||||
ElementAccumulator,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
MainloopSchedule
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
auto pass = test::gemm::device::TestSmall<Gemm>(2.0, 0.0);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,465 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////// 128x128x128 //////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128_1x1x1) {
  // Ptr-array (grouped) FP8 e4m3 GEMM on SM100, 1-SM warp-specialized TMA
  // schedules, f32 accumulation. A row-major, B column-major (TN), C/D
  // column-major. 128x128x128 cluster tile, 1x1x1 cluster.
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Divisor _1: the 1-SM schedule does not group CTAs per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),   // 16B-aligned C access
      ElementD, LayoutC, 16 / sizeof(ElementD),   // 16B-aligned D access
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      // Stage count fills remaining smem after carving out the epilogue's share.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_1x2x1) {
  // Ptr-array FP8 e4m3 GEMM, 1-SM schedules, f32 accumulation.
  // A row-major, B/C column-major. 64x128x128 cluster tile, 1x2x1 cluster.
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_2,_1>;
  // Divisor _1: the 1-SM schedule does not group CTAs per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128_2x1x1) {
  // Ptr-array FP8 e4m3 GEMM, 2-SM schedules, f32 accumulation.
  // A row-major, B/C column-major. 256x128x128 cluster tile, 2x1x1 cluster.
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_2,_1,_1>;
  // Divisor _2 in M: the 2-SM schedule pairs CTAs along M per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 512x512x128_4x4x1) {
  // Ptr-array FP8 e4m3 GEMM, 2-SM schedules, f32 accumulation.
  // A row-major, B/C column-major. 512x512x128 cluster tile, 4x4x1 cluster.
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_4,_4,_1>;
  // Divisor _2 in M: the 2-SM schedule pairs CTAs along M per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128_1x1x1) {
  // Ptr-array FP8 e4m3 GEMM, 1-SM schedules, f32 accumulation.
  // NT layouts: A column-major, B row-major; C/D column-major.
  // 128x128x128 cluster tile, 1x1x1 cluster.
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Divisor _1: the 1-SM schedule does not group CTAs per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_1x2x1) {
  // Ptr-array FP8 e4m3 GEMM, 1-SM schedules, f32 accumulation.
  // NN layouts: A, B, and C/D all column-major.
  // 64x128x128 cluster tile, 1x2x1 cluster.
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_2,_1>;
  // Divisor _1: the 1-SM schedule does not group CTAs per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128_2x1x1) {
  // Ptr-array FP8 e4m3 GEMM, 2-SM schedules, f32 accumulation.
  // TT layouts: A and B row-major; C/D column-major.
  // 256x128x128 cluster tile, 2x1x1 cluster.
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_2,_1,_1>;
  // Divisor _2 in M: the 2-SM schedule pairs CTAs along M per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_ptr_array, 512x512x128_4x4x1) {
  // Ptr-array FP8 e4m3 GEMM, 2-SM schedules, f32 accumulation.
  // A row-major, B column-major, C/D row-major.
  // 512x512x128 cluster tile, 4x4x1 cluster.
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_4,_4,_1>;
  // Divisor _2 in M: the 2-SM schedule pairs CTAs along M per MMA.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
  EXPECT_TRUE(pass);
}
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,297 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
TEST(SM100_Device_Gemm_e5m2t_e4m3n_e4m3t_tensorop_2sm_f32_runtime_datatype, 256x128x128_2x2x1) {
  // Runtime-datatype GEMM: A and B are type-erased 8-bit floats whose concrete
  // formats (A=E5M2, B=E4M3) are supplied at run time. 2-SM kernel schedule,
  // 256x128x128 MMA tile, f32 accumulation, e4m3 row-major output.
  // NOTE(review): the test name says cluster 2x2x1 but the cluster shape used
  // below is 2x1x1 — confirm which was intended.
  using ElementAB    = cutlass::type_erased_dynamic_float8_t;
  using ElementCD    = cutlass::float_e4m3_t;
  using MmaTileShape = Shape<_256,_128,_128>;
  using ClusterShape = Shape<_2,_1,_1>;
  using FusionOp     = cutlass::epilogue::fusion::LinearCombination<
      ElementCD, float, ElementCD, float>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      ElementCD, cutlass::layout::RowMajor, 16,
      ElementCD, cutlass::layout::RowMajor, 16,
      cutlass::epilogue::TmaWarpSpecialized2Sm,
      FusionOp
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementAB, cutlass::layout::RowMajor, 16,
      ElementAB, cutlass::layout::ColumnMajor, 16,
      float,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(
      cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E4M3);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e5m2t_e4m3n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
  // Runtime-datatype GEMM: A and B are type-erased 8-bit floats whose concrete
  // formats (A=E5M2, B=E4M3) are supplied at run time. 1-SM kernel schedule,
  // 128x128x128 MMA tile, 2x2x1 cluster, f32 accumulation, e4m3 row-major output.
  using ElementAB    = cutlass::type_erased_dynamic_float8_t;
  using ElementCD    = cutlass::float_e4m3_t;
  using MmaTileShape = Shape<_128,_128,_128>;
  using ClusterShape = Shape<_2,_2,_1>;
  using FusionOp     = cutlass::epilogue::fusion::LinearCombination<
      ElementCD, float, ElementCD, float>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      ElementCD, cutlass::layout::RowMajor, 16,
      ElementCD, cutlass::layout::RowMajor, 16,
      cutlass::epilogue::TmaWarpSpecialized1Sm,
      FusionOp
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementAB, cutlass::layout::RowMajor, 16,
      ElementAB, cutlass::layout::ColumnMajor, 16,
      float,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(
      cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E4M3);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e5m2n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
  // Runtime-datatype GEMM with swapped FP8 formats relative to the test above:
  // A resolves to E4M3 and B to E5M2 at run time. 1-SM kernel schedule,
  // 128x128x128 MMA tile, 2x2x1 cluster, f32 accumulation, e4m3 row-major output.
  using ElementAB    = cutlass::type_erased_dynamic_float8_t;
  using ElementCD    = cutlass::float_e4m3_t;
  using MmaTileShape = Shape<_128,_128,_128>;
  using ClusterShape = Shape<_2,_2,_1>;
  using FusionOp     = cutlass::epilogue::fusion::LinearCombination<
      ElementCD, float, ElementCD, float>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      ElementCD, cutlass::layout::RowMajor, 16,
      ElementCD, cutlass::layout::RowMajor, 16,
      cutlass::epilogue::TmaWarpSpecialized1Sm,
      FusionOp
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementAB, cutlass::layout::RowMajor, 16,
      ElementAB, cutlass::layout::ColumnMajor, 16,
      float,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(
      cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E5M2);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
  // Runtime-datatype GEMM where both A and B resolve to E4M3 at run time.
  // 1-SM kernel schedule, 128x128x128 MMA tile, 2x2x1 cluster,
  // f32 accumulation, e4m3 row-major output.
  using ElementAB    = cutlass::type_erased_dynamic_float8_t;
  using ElementCD    = cutlass::float_e4m3_t;
  using MmaTileShape = Shape<_128,_128,_128>;
  using ClusterShape = Shape<_2,_2,_1>;
  using FusionOp     = cutlass::epilogue::fusion::LinearCombination<
      ElementCD, float, ElementCD, float>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      ElementCD, cutlass::layout::RowMajor, 16,
      ElementCD, cutlass::layout::RowMajor, 16,
      cutlass::epilogue::TmaWarpSpecialized1Sm,
      FusionOp
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementAB, cutlass::layout::RowMajor, 16,
      ElementAB, cutlass::layout::ColumnMajor, 16,
      float,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
      cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(
      cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E4M3);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
TEST(SM100_Device_Gemm_e5m2t_e5m2n_e5m2t_tensorop_2sm_f32_runtime_datatype, 256x256x128_2x2x1) {
  // Runtime-datatype GEMM where both A and B resolve to E5M2 at run time.
  // 2-SM kernel schedule, 256x128x128 MMA tile, 2x2x1 cluster,
  // f32 accumulation, e5m2 row-major output.
  //
  // Fix: this 2-SM test previously built the epilogue with the 1-SM schedule
  // (TmaWarpSpecialized1Sm) and a 128x128 tile while the mainloop used
  // KernelTmaWarpSpecialized2SmSm100 with a 256x128 MMA tile. Aligned the
  // epilogue with the mainloop (2Sm schedule, matching MMA tile), mirroring
  // the sibling 2sm runtime-datatype test in this file.
  using ElementAB    = cutlass::type_erased_dynamic_float8_t;
  using ElementCD    = cutlass::float_e5m2_t;
  using MmaTileShape = Shape<_256,_128,_128>;
  using ClusterShape = Shape<_2,_2,_1>;
  using FusionOp     = cutlass::epilogue::fusion::LinearCombination<
      ElementCD, float, ElementCD, float>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      float, float,
      ElementCD, cutlass::layout::RowMajor, 16,
      ElementCD, cutlass::layout::RowMajor, 16,
      cutlass::epilogue::TmaWarpSpecialized2Sm,
      FusionOp
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementAB, cutlass::layout::RowMajor, 16,
      ElementAB, cutlass::layout::ColumnMajor, 16,
      float,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
      cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(
      cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E5M2);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,230 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////// Test Batch alpha and beta //////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// FP8 (e4m3) GEMM exercising per-batch alpha/beta: the testbed is invoked
// with batched scalars enabled (third/fourth template flags) so each batch
// uses its own alpha/beta pair.
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1cta_s32_batch_alpha_beta, 128x64x128_1x1x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;

  using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
      ElementD,
      ElementCompute,
      ElementC,
      ElementBias
  >;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule,
      FusionOperation
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm, false, true, true>(1.0, 1.0); // beta is [1.0, 2.0]
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// FP8 GEMM with scaled linear-combination + per-row bias + ReLU epilogue
// fusion, exercising per-batch beta values ([0.5, 1.5]).
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_bias_relu_batch_alpha_beta, 128x128x128_1x1x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
  // Fusion: D = ReLU(alpha * acc + beta * C + per-row bias).
  using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltAct<
      cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule,
      FusionOperation
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm, false, false, true>(1.0, 0.5); // beta is [0.5, 1.5]
  EXPECT_TRUE(pass);
}
|
||||
|
||||
// Same bias+ReLU fusion as the test above, but with a batch beta range that
// crosses zero ([-1.0, 0.0]) to cover the beta == 0 path per batch.
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_bias_relu__batch_alpha_beta0, 128x128x128_1x1x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementAccumulator = float;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
  // Fusion: D = ReLU(alpha * acc + beta * C + per-row bias).
  using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltAct<
      cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule,
      FusionOperation
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm, false, false, true>(1.0, -1.0); // beta is [-1.0, 0.0]
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,284 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////// 128x64x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// int8 pointer-array (batched) GEMM, 1-CTA schedule, 128x64 output tile,
// 1x1x1 cluster. Integer accumulation is exact, so the testbed checks
// bit-exact equality.
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 128x64x128_1x1x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = int8_t;
  using ElementB = int8_t;
  using ElementC = int8_t;
  using ElementD = int8_t;
  using ElementAccumulator = int32_t;
  using ElementCompute = float;
  using ElementBias = int8_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));

  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
  EXPECT_TRUE(pass);
}
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////// 128x64x128 Cluster4x2x1 TMEM 4x1 ////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// int8 pointer-array GEMM on a 4x2x1 cluster (512x128 cluster tile).
// Note: the original had `using OutputCtaShape = ...;` declared twice
// (once before and once after AtomThrShape); the redundant duplicate is
// removed here, keeping the single declaration that matches the sibling
// tests in this file.
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 512x128x128_4x2x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = int8_t;
  using ElementB = int8_t;
  using ElementC = int8_t;
  using ElementD = int8_t;
  using ElementAccumulator = int32_t;
  using ElementCompute = float;
  using ElementBias = int8_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = Shape<_512,_128,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_4,_2,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));

  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////// 64x256x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// int8 inputs with int32 C/D outputs: pointer-array GEMM, 64x256 output
// tile, 1x1x1 cluster, all-integer compute path.
TEST(SM100_Device_Gemm_s8t_s8n_s32n_tensorop_1cta_s32_ptr_array, 64x256x128_1x1x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = int8_t;
  using ElementB = int8_t;
  using ElementC = int32_t;
  using ElementD = int32_t;
  using ElementAccumulator = int32_t;
  using ElementCompute = int32_t;
  using ElementBias = int32_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));

  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////// 64x256x128 Cluster2x4x1 TMEM 2x2 ////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// int8 pointer-array GEMM using the 2-CTA (2-SM) MMA path: AtomThrShape
// divides the cluster by <_2,_1,_1>, pairing two CTAs per MMA atom.
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4x1) {
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;
  using ElementA = int8_t;
  using ElementB = int8_t;
  using ElementC = int8_t;
  using ElementD = int8_t;
  using ElementAccumulator = int32_t;
  using ElementCompute = float;
  using ElementBias = int8_t;
  // K extent scales with element width so the K tile stays 128 bytes.
  using ClusterTileShape = Shape<_128,_1024,Int<128 / sizeof(ElementA)>>;
  using ClusterShape = Shape<_2,_4,_1>;
  // Two CTAs cooperate per MMA atom along M.
  using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));

  using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
  using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));

  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, LayoutC, 16 / sizeof(ElementC),
      ElementD, LayoutC, 16 / sizeof(ElementD),
      EpilogueSchedule
    >::CollectiveOp;

  using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 16 / sizeof(ElementA),
      ElementB, LayoutB, 16 / sizeof(ElementB),
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      MainloopSchedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using namespace test::gemm::device;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
@ -0,0 +1,293 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/// A Row B Col
|
||||
/// A Row B Col
// Block-scaled mixed-precision (e2m1 x e4m3, ue8m0 scale factors) group GEMM
// on the 2-SM MXF8F6F4 path. Layouts are passed as pointer types
// (GmemLayoutA * etc.) because each group carries its own stride.
// Note: the original declared `using ElementAccumulator = float;` twice; the
// redundant second declaration is removed here.
TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  using ElementA = cutlass::float_e2m1_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;   // no source C — D = alpha * acc
  using ElementD = cutlass::float_e4m3_t;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;
  // Operand/scale-factor pairs consumed by the block-scaled builder.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::RowMajor;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 128,
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
      CollectiveMainloop,
      CollectiveEpilogue
  >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Col B Row
|
||||
TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e2m1 (col-major), B = e4m3 (row-major), D = e4m3 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors.
  using ElementA = cutlass::float_e2m1_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;                       // no source C operand
  using ElementD = cutlass::float_e4m3_t;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using GmemLayoutB = cutlass::layout::RowMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 128,        // wider alignment for the narrow e2m1 operand
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Row B Row
|
||||
TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e2m1 (row-major), B = e4m3 (row-major), D = e4m3 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors.
  using ElementA = cutlass::float_e2m1_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;                       // no source C operand
  using ElementD = cutlass::float_e4m3_t;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::RowMajor;
  using GmemLayoutB = cutlass::layout::RowMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 128,        // wider alignment for the narrow e2m1 operand
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Col B Col
|
||||
TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e2m1 (col-major), B = e4m3 (col-major), C/D = e4m3 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors. Unlike the sibling
  // tests, C is a real operand here and the harness is run with beta = 2.0.
  using ElementA = cutlass::float_e2m1_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = cutlass::float_e4m3_t;
  using ElementD = cutlass::float_e4m3_t;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 128,        // wider alignment for the narrow e2m1 operand
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestAll<Gemm>(1.0, 2.0);   // alpha = 1.0, beta = 2.0 (exercises C)
  EXPECT_TRUE(pass);
}
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,281 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/// A Row B Col
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
  // Block-scaled (MX) FP8 GEMM with auto-selected schedules on SM100, 2-SM MMA:
  // A = mxf8 (row-major), B = mxf8 (col-major), D = f32 (row-major).
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // value + scale-factor composite type
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using ElementC = void;                                         // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, 4,
      ElementD, GmemLayoutC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto       // let the builder pick the epilogue schedule
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, 16,
      ElementB, GmemLayoutB, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto             // let the builder pick the mainloop schedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // (M, N, K, L) with batch dimension L
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Col B Row
|
||||
TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
  // Block-scaled (MX) FP8 GEMM with auto-selected schedules on SM100, 2-SM MMA:
  // A = mxf8 (col-major), B = mxf8 (row-major), D = f32 (row-major).
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // value + scale-factor composite type
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using ElementC = void;                                         // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using GmemLayoutB = cutlass::layout::RowMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, 4,
      ElementD, GmemLayoutC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto       // let the builder pick the epilogue schedule
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, 16,
      ElementB, GmemLayoutB, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto             // let the builder pick the mainloop schedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // (M, N, K, L) with batch dimension L
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Row B Row
|
||||
TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
  // Block-scaled (MX) FP8 GEMM with auto-selected schedules on SM100, 2-SM MMA:
  // A = mxf8 (row-major), B = mxf8 (row-major), D = f32 (row-major).
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // value + scale-factor composite type
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using ElementC = void;                                         // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using GmemLayoutA = cutlass::layout::RowMajor;
  using GmemLayoutB = cutlass::layout::RowMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, 4,
      ElementD, GmemLayoutC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto       // let the builder pick the epilogue schedule
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, 16,
      ElementB, GmemLayoutB, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto             // let the builder pick the mainloop schedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // (M, N, K, L) with batch dimension L
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Col B Col
|
||||
TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
  // Block-scaled (MX) FP8 GEMM with auto-selected schedules on SM100, 2-SM MMA:
  // A = mxf8 (col-major), B = mxf8 (col-major), D = f32 (row-major).
  using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;  // value + scale-factor composite type
  using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
  using ElementC = void;                                         // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC, 4,
      ElementD, GmemLayoutC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto       // let the builder pick the epilogue schedule
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      ElementA, GmemLayoutA, 16,
      ElementB, GmemLayoutB, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::collective::KernelScheduleAuto             // let the builder pick the mainloop schedule
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      Shape<int,int,int,int>,   // (M, N, K, L) with batch dimension L
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,293 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*! \file
|
||||
\brief Tests for device-wide GEMM interface
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "gemm_testbed_3x_ptr_array.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/// A Row B Col
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e4m3 (row-major), B = e4m3 (col-major), D = f32 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;                       // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::RowMajor;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 16,
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Col B Row
|
||||
TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e4m3 (col-major), B = e4m3 (row-major), D = f32 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;                       // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using GmemLayoutB = cutlass::layout::RowMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 16,
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Row B Row
|
||||
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e4m3 (row-major), B = e4m3 (row-major), D = f32 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;                       // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::RowMajor;
  using GmemLayoutB = cutlass::layout::RowMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 16,
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
|
||||
/// A Col B Col
|
||||
TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
  // Grouped (ptr-array) block-scaled GEMM on SM100, 2-SM MMA:
  // A = e4m3 (col-major), B = e4m3 (col-major), D = f32 (row-major),
  // f32 accumulation, ue8m0 per-block scale factors.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = void;                       // no source C operand
  using ElementD = float;
  using ElementCompute = float;
  using ElementAccumulator = float;
  using ElementSF = cutlass::float_ue8m0_t;    // block scale-factor type
  // (value, scale-factor) pairs consumed by the block-scaled tensor-op mainloop.
  using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
  using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
  using GmemLayoutA = cutlass::layout::ColumnMajor;
  using GmemLayoutB = cutlass::layout::ColumnMajor;
  using GmemLayoutC = cutlass::layout::RowMajor;
  using ClusterTileShape_MNK = Shape<_512,_512,_128>;
  using ClusterShape_MNK = Shape<_4,_4,_1>;
  using MmaTileShape_MNK = Shape<_256,_128,_128>;
  // Per-CTA output tile = cluster tile / cluster shape.
  using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));

  //
  // Construct CollectiveEpilogue
  //
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      OutputCtaShape, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementCompute,
      ElementC, GmemLayoutC *, 16,             // pointer-to-layout tag selects the grouped/ptr-array API
      ElementD, GmemLayoutC *, 16,
      cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
    >::CollectiveOp;

  //
  // Construct CollectiveMainloop
  //
  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
      MmaTypePairA, GmemLayoutA *, 16,
      MmaTypePairB, GmemLayoutB *, 16,
      ElementAccumulator,
      MmaTileShape_MNK, ClusterShape_MNK,
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
    >::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,  // per-group (M,N,K)
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
  EXPECT_TRUE(pass);
}
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -87,6 +87,7 @@ TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align1, 6
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
|
||||
|
||||
using ElementOutput = float;
|
||||
@ -124,6 +125,8 @@ TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 1
|
||||
|
||||
EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
|
||||
}
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -2974,6 +2974,7 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_64x64x1024_64x64x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3006,8 +3007,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_64x64x1024_32x32x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3040,8 +3044,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_128x64x1024_64x32x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3074,8 +3081,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_64x1024x1024_32x64x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3108,8 +3118,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_128x1024x1024_64x64x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3142,8 +3155,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
multicta_256x256x6144_128x1024x1024_64x64x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3176,8 +3192,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
multicta_512x256x6144_256x1024x1024_64x64x1024_16x8x256_3stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3210,8 +3229,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_64x64x512_64x64x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3244,8 +3266,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_64x64x512_32x32x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3278,8 +3303,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_128x64x512_64x32x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3312,8 +3340,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_64x128x512_32x64x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3346,8 +3377,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
tensor_op_128x128x512_64x64x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3380,8 +3414,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
multicta_256x256x6144_128x128x512_64x64x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3414,8 +3451,11 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if 0
|
||||
TEST(SM80_gemm_threadblock_crosswise,
|
||||
multicta_512x256x6144_256x128x512_64x64x512_16x8x256_4stage) {
|
||||
using ElementA = cutlass::uint1b_t;
|
||||
@ -3448,6 +3488,8 @@ TEST(SM80_gemm_threadblock_crosswise,
|
||||
problem_size.k(), alpha, beta)
|
||||
.run(grid, block);
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM80_gemm_threadblock_congruous,
|
||||
tensor_op_64x64x16_32x64x16_8x8x4_3stage) {
|
||||
|
||||
@ -31,6 +31,7 @@ cutlass_test_unit_add_executable(
|
||||
pipeline_tma_async.cu
|
||||
pipeline_tma_async_warp_specialized.cu
|
||||
pipeline_tma_async_warp_specialized_persistent.cu
|
||||
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
|
||||
pipeline_async.cu
|
||||
sequence_barrier.cu
|
||||
)
|
||||
|
||||
@ -0,0 +1,381 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief Unit test for the PipelineCLCFetchAsync class
|
||||
*/
|
||||
|
||||
//
|
||||
|
||||
//
|
||||
|
||||
#define KERNEL_DBG_TRACE false
|
||||
|
||||
#include <cuda/atomic>
|
||||
#include "../common/cutlass_unit_test.h"
|
||||
#include <thrust/host_vector.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <cute/tensor.hpp>
|
||||
#include <cute/arch/cluster_sm90.hpp>
|
||||
|
||||
#include <cutlass/util/reference/host/gemm.h>
|
||||
#include <cutlass/cluster_launch.hpp>
|
||||
|
||||
#include "cutlass/core_io.h"
|
||||
#include "cutlass/util/print_error.hpp"
|
||||
#include "cutlass/util/GPU_Clock.hpp"
|
||||
|
||||
#include "testbed_cluster_launch_control.h"
|
||||
#include "cutlass/pipeline/pipeline.hpp"
|
||||
#include "cutlass/arch/barrier.h"
|
||||
#include "cute/arch/cluster_sm90.hpp"
|
||||
#include "cutlass/arch/barrier.h"
|
||||
#include "cutlass/arch/reg_reconfig.h"
|
||||
#include "cutlass/gemm/kernel/sm100_tile_scheduler.hpp"
|
||||
|
||||
|
||||
using namespace cute;
|
||||
using namespace cutlass;
|
||||
using namespace cutlass::gemm::kernel::detail;
|
||||
|
||||
//////////////////// Shared Memory /////////////////////////
|
||||
|
||||
/// Dynamic shared-memory layout for the CLC pipeline kernel:
/// one CLC response slot per pipeline stage plus the pipeline's
/// mbarrier storage. Alignments match the hardware requirements of
/// the scheduler's response writes (16B) and mbarriers (8B).
template <uint32_t Stages, typename ClusterShape>
struct SharedStorage
{
  // One cluster-launch-control query response per in-flight stage.
  alignas(16) typename PersistentTileSchedulerSm100<ClusterShape, Stages>::CLCResponse clc_response[Stages];
  // Pipeline mbarrier state (full/empty barriers) for the scheduler pipeline.
  alignas(8) typename PersistentTileSchedulerSm100<ClusterShape, Stages>::PipelineStorage storage ;
};
|
||||
|
||||
//////////////////// Kernel /////////////////////////
|
||||
/// Kernel exercising the SM100 cluster-launch-control (CLC) scheduler pipeline.
/// One warp per cluster produces CLC queries; one warp per CTA consumes the
/// resulting work tiles and atomically tallies, per linear tile id, how many
/// workers claimed it. Host-side verification expects exactly one worker per id.
template <typename ClusterShape, uint32_t Stages>
__launch_bounds__(256, 1)
__global__ static
void pipeline_device(int *d_workerCount)
{
  extern __shared__ char shared_memory[];

  // single producer, multiple consumers
  // producer: WG0
  // consumer: WG1

  using SharedStorage = SharedStorage<Stages, ClusterShape>;
  using Scheduler = PersistentTileSchedulerSm100<ClusterShape, Stages>;
  using TileSchedulingPipeline = typename Scheduler::Pipeline;
  SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);

  // Logistics
  int warp_idx = canonical_warp_idx();
  auto cluster_shape = ClusterShape{};

  typename TileSchedulingPipeline::Params params;
  // Bytes the producer's TMA/CLC transaction deposits per stage
  // (size of one CLC response — TODO confirm against Scheduler::CLCResponse).
  params.transaction_bytes = 16;

  // One full warp produces; one full warp per CTA consumes.
  constexpr int NUM_PRODUCER = 32;
  constexpr int NUM_CONSUMERS_PER_CTA = 32;
  // Arrival count spans the whole cluster: the producer warp plus one
  // consumer warp in every CTA of the (M x N) cluster.
  params.consumer_arv_count = NUM_PRODUCER + NUM_CONSUMERS_PER_CTA * cute::size<0>(cluster_shape) * cute::size<1>(cluster_shape);
  params.producer_arv_count = 1;
  // Only the first CTA in the Cluster is producing.
  params.producer_blockid = 0;

  dim3 block_id_in_cluster = cute::block_id_in_cluster();
  // mbarrier.init
  TileSchedulingPipeline scheduler_pipeline(shared_storage.storage, params );
  Scheduler scheduler(&shared_storage.clc_response[0], typename Scheduler::Params{}, block_id_in_cluster);

  // Ensure All CTAs in Cluster have completed init before issuing commits
  cute::cluster_arrive_relaxed();
  cute::cluster_wait();

  uint32_t is_first_block_in_cluster = block_id_in_cluster.x == 0 && block_id_in_cluster.y == 0;
  int lane_predicate = cute::elect_one_sync();

  // Producer role: warp 0 of the cluster's first CTA only.
  uint32_t is_producer = (is_first_block_in_cluster && warp_idx == 0);
  // Consumer role: warp 4 (first warp of the second warpgroup) in every CTA.
  uint32_t is_consumer = (warp_idx == 4);

  PipelineState<Stages> scheduler_pipe_state;
  PipelineState<Stages> scheduler_pipe_state_write = cutlass::make_producer_start_state<TileSchedulingPipeline>();
  // Seed the first work tile from this block's launch coordinates; the
  // trailing field is is_valid_tile (see the loop condition below).
  typename Scheduler::WorkTileInfo work_tile_info = {
    static_cast<int32_t>(blockIdx.x),
    static_cast<int32_t>(blockIdx.y),
    static_cast<int32_t>(blockIdx.z),
    false
  };

  // Persistent loop
  do {
    // Producer
    if (is_producer) {
      // Only 1 thread of the entire cluster issues the query.
      scheduler_pipe_state_write = scheduler.advance_to_next_work(scheduler_pipeline, scheduler_pipe_state_write);
    }

    // Consumers
    if (is_consumer) {
      // Linearize the current tile coordinate over the grid's M dimension.
      int linearCLC = work_tile_info.N_idx * gridDim.x + work_tile_info.M_idx;
      // Atomically increment the worker count for the linearCLC by 1.
      if (lane_predicate) {
        atomicAdd(&d_workerCount[linearCLC], 1);
      }
    }

    // Union of all consumers. Note that the producer here is its own consumer.
    if (is_producer || is_consumer) {
      // Wait for the stage to be filled, read the next tile, release the stage.
      scheduler_pipeline.consumer_wait(scheduler_pipe_state);
      work_tile_info = scheduler.get_current_work(scheduler_pipe_state);
      scheduler_pipeline.consumer_release(scheduler_pipe_state);
      ++scheduler_pipe_state;

      // Add block offset since the scheduler works at cluster level.
      dim3 block_id_in_cluster = cute::block_id_in_cluster();
      work_tile_info.M_idx += block_id_in_cluster.x;
      work_tile_info.N_idx += block_id_in_cluster.y;
      work_tile_info.L_idx += block_id_in_cluster.z;

    }
  } while (work_tile_info.is_valid_tile);

  // End of kernel
  cute::cluster_sync();
}
|
||||
/////////////////////////////////////////////////////
|
||||
|
||||
template<uint32_t Stages_, typename ClusterShape_>
|
||||
struct PipelineTest {
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
static constexpr uint32_t Stages = Stages_;
|
||||
static constexpr uint32_t BlockSize = 128 * 2;
|
||||
using ClusterShape = ClusterShape_;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
bool check_results(int *h_workerCount, int size ) {
|
||||
for (int i = 0 ; i< size; i++ ){
|
||||
if ( h_workerCount[i] != 1 )
|
||||
{
|
||||
std::cout << "linearCLC " << i << " has worker count " << h_workerCount[i] << "\n";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Run CuTe GEMM kernel
|
||||
cudaError_t run(bool &success, dim3 grid_dim,
|
||||
cudaStream_t stream = 0 ) {
|
||||
|
||||
//
|
||||
// Configure and launch
|
||||
//
|
||||
cudaError_t result;
|
||||
|
||||
int smem_size = 192 * 1024; // 192kB to force 1CTA/SM
|
||||
auto cluster_shape = Shape<Int<ClusterShape::kM>, Int<ClusterShape::kN>, _1>{};
|
||||
// Launch a single Cluster, with BlockSize threads per CTA
|
||||
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), 1);
|
||||
dim3 dimGrid = grid_dim;
|
||||
dim3 dimBlock(BlockSize,1,1);
|
||||
|
||||
result = cudaFuncSetAttribute(
|
||||
pipeline_device<
|
||||
decltype(cluster_shape),
|
||||
Stages>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
smem_size
|
||||
);
|
||||
|
||||
if (result != cudaSuccess) {
|
||||
std::cerr << "Error: Failed to set Shared Memory size." << std::endl;
|
||||
return result;
|
||||
}
|
||||
|
||||
int array_size = dimGrid.x * dimGrid.y;
|
||||
int *d_workerCount, *h_workerCount;
|
||||
|
||||
/* Allocate memory. workerCount[i] counts the number of worker(s) which work
|
||||
on linear t i. The expectation is that workerCount[i] == 1 for all i.
|
||||
*/
|
||||
h_workerCount = (int*)malloc(array_size * sizeof(int));
|
||||
|
||||
result = cudaMalloc(&d_workerCount, array_size * sizeof(int));
|
||||
if (result != cudaSuccess) {
|
||||
std::cerr << "Failed to do cudaMalloc." << result << "\n";
|
||||
return result;
|
||||
}
|
||||
|
||||
for(int i = 0 ; i < array_size; i++)
|
||||
{
|
||||
h_workerCount[i] = 0; // Initialize workerCount[i] to 0 for all i.
|
||||
}
|
||||
|
||||
result = cudaMemcpy(d_workerCount, h_workerCount, array_size * sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (result != cudaSuccess) {
|
||||
std::cerr << "Failed to do cudaMemcpy." << result << "\n";
|
||||
return result;
|
||||
}
|
||||
|
||||
// Extended launch API
|
||||
const void* kernel = (const void*)pipeline_device<decltype(cluster_shape), Stages>;
|
||||
void* kernel_params[] = {&d_workerCount};
|
||||
cutlass::ClusterLauncher::launch(dimGrid, dimCluster, dimBlock, smem_size, stream, kernel, kernel_params);
|
||||
|
||||
result = cudaDeviceSynchronize();
|
||||
if (result != cudaSuccess) {
|
||||
std::cerr << "Error: cudaDeviceSynchronize() failed" << std::endl;
|
||||
return result;
|
||||
}
|
||||
|
||||
result = cudaMemcpy(h_workerCount, d_workerCount, array_size * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (result != cudaSuccess) {
|
||||
std::cerr << "Failed to do cudaMemcpy." << result << "\n";
|
||||
return result;
|
||||
}
|
||||
|
||||
success = check_results(h_workerCount, array_size);
|
||||
|
||||
free(h_workerCount);
|
||||
|
||||
result = cudaFree(d_workerCount);
|
||||
if (result != cudaSuccess) {
|
||||
std::cerr << "Failed to do cudaFree." << result << "\n";
|
||||
return result;
|
||||
}
|
||||
|
||||
return cudaSuccess;
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
//Cluster1x2 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x2_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<1, 2, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster2x1 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x1_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<2, 1, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster2x2 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x2_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<2, 2, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster1x1 Stage3
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x1_Stage3) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<1, 1, 1>;
|
||||
static constexpr uint32_t Stages = 3;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster1x4 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x4_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<1, 4, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster4x1 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x1_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<4, 1, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster2x4 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x4_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<2, 4, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster4x2 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x2_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<4, 2, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
|
||||
//Cluster4x4 Stage4
|
||||
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x4_Stage4) {
|
||||
Options options;
|
||||
options.grid_dim = {32,32,1};
|
||||
using ClusterShape = cutlass::gemm::GemmShape<4, 4, 1>;
|
||||
static constexpr uint32_t Stages = 4;
|
||||
using Test = PipelineTest<Stages, ClusterShape>;
|
||||
Testbed<Test> testbed(options);
|
||||
EXPECT_TRUE(testbed.verification());
|
||||
}
|
||||
#endif
|
||||
154
test/unit/pipeline/testbed_cluster_launch_control.h
Normal file
154
test/unit/pipeline/testbed_cluster_launch_control.h
Normal file
@ -0,0 +1,154 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief Testbed file used by cluster launch control pipeline unit test
|
||||
*/
|
||||
|
||||
//
|
||||
|
||||
//
|
||||
|
||||
#if CUDA_12_0_SM90_FEATURES_SUPPORTED
|
||||
#define CUTLASS_UNIT_TEST_PIPELINE true
|
||||
#else
|
||||
#define CUTLASS_UNIT_TEST_PIPELINE false
|
||||
#endif
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <cassert>
|
||||
#include <cutlass/gemm/gemm.h>
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
|
||||
// Command line test options
struct Options {
  //
  // Data Members
  //
  bool help = false;                 // print usage and exit
  bool verification_enabled = true;  // run result verification after launch
  int SM_count = 116;                // number of SMs on the target chip
  int clock_MHz = 1477;              // locked clock value in MHz
  // Grid to launch. NOTE(review): not parsed from the command line — tests
  // assign it directly after construction; confirm that is intentional.
  dim3 grid_dim = {0,0,0};

  //
  // Methods
  //

  // Parses --help, --verification-enabled, --sm-count and --clock;
  // each defaults to its current value when the flag is absent.
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    cmd.get_cmd_line_argument("verification-enabled", verification_enabled, verification_enabled);
    cmd.get_cmd_line_argument("sm-count", SM_count, SM_count);
    cmd.get_cmd_line_argument("clock", clock_MHz, clock_MHz);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "Options:\n\n"
        << "  --help                          If specified, displays this usage statement.\n\n"
        << "  --verification-enabled=<bool>   Enable/Disable verification\n"
        << "  --sm-count=<int>                Number of SMs on the chip\n"
        << "  --clock=<int>                   Locked clock value in Mhz\n";

    return out;
  }
};
|
||||
|
||||
//
|
||||
// Testbed
|
||||
//
|
||||
|
||||
template<typename Pipeline>
|
||||
class Testbed {
|
||||
private:
|
||||
// Commandline options
|
||||
Options options;
|
||||
|
||||
bool run_test() {
|
||||
|
||||
// Run CuTe Gemm
|
||||
Pipeline pipeline;
|
||||
|
||||
bool success = false;
|
||||
cudaError_t result = pipeline.run(success, this->options.grid_dim);
|
||||
|
||||
CUTE_CHECK_LAST();
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
Testbed(Options const &options_) : options(options_) {
|
||||
int device_id = 0;
|
||||
cudaDeviceProp device_prop;
|
||||
CUTE_CHECK_ERROR(cudaSetDevice(device_id));
|
||||
CUTE_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
|
||||
|
||||
if (device_prop.major < 1) {
|
||||
fprintf(stderr, "Device does not support CUDA.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Run verification Gemm problem sizes
|
||||
bool verification() {
|
||||
|
||||
#if !defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
printf(
|
||||
"CUTLASS_ARCH_MMA_SM100_SUPPORTED must be set, but it is not. \n"
|
||||
"This test is waived.\n"
|
||||
);
|
||||
return true;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
bool is_success = false;
|
||||
for (int i = 0; i< 10; i++){
|
||||
printf("iteration = %d\n", i);
|
||||
is_success = run_test();
|
||||
if ( not is_success )
|
||||
return is_success;
|
||||
}
|
||||
return is_success;
|
||||
#else
|
||||
// Run the test with single launch
|
||||
return run_test();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
Reference in New Issue
Block a user