Updates for 3.4 release. (#1305)

This commit is contained in:
ANIKET SHIVAM
2024-01-16 10:42:51 -08:00
committed by GitHub
parent acba5beee5
commit 2f589ffa76
166 changed files with 5996 additions and 4702 deletions

View File

@ -38,9 +38,10 @@
#include <vector>
#include <numeric>
#include <cute/tensor.hpp>
#include <cute/container/bit_field.hpp>
#include <cute/algorithm/tuple_algorithms.hpp>
using namespace cute;
TEST(CuTe_core, Bitfield)

View File

@ -43,26 +43,30 @@ test_complement(Layout const& layout, CoSizeHi const& cosize_hi)
auto result = complement(layout, cosize_hi);
CUTLASS_TRACE_HOST("complement( " << layout << ", " << cosize_hi << ") => " << result);
CUTLASS_TRACE_HOST("complement(" << layout << ", " << cosize_hi << ") => " << result);
// Post-condition on the domain size of the complement (1)
EXPECT_GE( size(result), cosize_hi / size(filter(layout)));
// Post-condition on the codomain size of the complement (2)
EXPECT_LE(cosize(result), cute::ceil_div(cosize_hi, cosize(layout)) * cosize(layout));
auto completed = make_layout(layout, result);
// Lower-bound on the codomain size of the layout ++ complement (1)
EXPECT_GE(cosize(completed), cosize_hi);
// Upper-bound on the codomain size of the complement (2)
EXPECT_LE(cosize(result), cute::round_up(cosize_hi, cosize(layout)));
// Post-condition on the codomain of the complement
for (int i = 1; i < size(result); ++i) {
EXPECT_LT(result(i-1), result(i)); // Ordered (3)
for (int j = 0; j < size(layout); ++j) {
EXPECT_NE(result(i), layout(j)); // Complemented (4)
EXPECT_NE(result(i), layout(j)); // Disjoint (4)
}
}
// Other observations
EXPECT_LE(size(result),cosize(result)); // As a result of the ordered condition (3)
EXPECT_GE(cosize(result), cosize_hi / size(filter(layout))); // As a result of (1) (2) and (5)
if constexpr (is_static<decltype(stride(make_layout(layout,result)))>::value) { // If we can apply complement again
EXPECT_EQ(size(complement(make_layout(layout,result))), 1); // There's no more codomain left over
EXPECT_LE(size(result), cosize(result)); // As a result of the ordered condition (3)
EXPECT_GE(size(result), cosize_hi / size(filter(layout)));
EXPECT_LE(cosize(completed), cosize(result) + cosize(layout));
EXPECT_GE(cosize(result), cosize_hi / size(filter(layout)));
if constexpr (is_static<decltype(stride(completed))>::value) { // If we can apply complement again
EXPECT_EQ(size(complement(completed)), 1); // There's no more codomain left over
}
}
@ -125,6 +129,7 @@ TEST(CuTe_core, Complement)
test_complement(layout, Int<1>{});
test_complement(layout);
test_complement(layout, Int<16>{});
test_complement(layout, Int<19>{});
}
{
@ -153,6 +158,12 @@ TEST(CuTe_core, Complement)
test_complement(layout);
}
{
auto layout = Layout<Shape<_2,_4>, Stride<_1,_6>>{};
test_complement(layout);
}
{
auto layout = Layout<Shape<_2,_4,_8>, Stride<_8,_1,_64>>{};
@ -167,26 +178,34 @@ TEST(CuTe_core, Complement)
}
{
auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
auto layout = make_layout(Shape <Shape <_2,_2>,Shape <_2, _2>>{},
Stride<Stride<_1,_4>,Stride<_8,_32>>{});
test_complement(layout);
}
{
auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
auto layout = make_layout(Shape <Shape <_2, _2>,Shape <_2,_2>>{},
Stride<Stride<_1,_32>,Stride<_8,_4>>{});
test_complement(layout);
}
// Fails due to non-injective input
//{
//auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
// Fails due to non-injective layout
// {
// auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
// Stride<Stride<_1,_8>,Stride<_8,_4>>{});
//test_complement(layout);
//}
// test_complement(layout);
// }
// Fails due to non-injective layout
// {
// auto layout = Layout<Shape<_2,_2>, Stride<_2,_3>>{};
// test_complement(layout);
// test_complement(layout, Int<19>{});
// }
{
auto layout = Layout<Shape<_4,_6>, Stride<_1,_6>>{};

View File

@ -42,8 +42,8 @@ using namespace cute;
template <class LayoutA, class LayoutB>
void
test_composition(const LayoutA& layoutA,
const LayoutB& layoutB)
test_composition(LayoutA const& layoutA,
LayoutB const& layoutB)
{
auto layoutR = composition(layoutA, layoutB);
@ -52,14 +52,12 @@ test_composition(const LayoutA& layoutA,
CUTLASS_TRACE_HOST(" => ");
CUTLASS_TRACE_HOST(layoutR);
// Test that layout R is compatible with layout B
// Test that layout B is compatible with layout R
EXPECT_TRUE(compatible(layoutB, layoutR));
// True post-condition: Every coordinate c of layoutB with L1D(c) < size(layoutR) is a coordinate of layoutR.
// Test that R(c) = A(B(c)) for all coordinates c in layoutR
for (int i = 0; i < size(layoutR); ++i) {
EXPECT_EQ(layoutR(i), layoutA(layoutB(i)));
// Test that R(c) = A(B(c)) for all coordinates c in layoutB
for (int c = 0; c < size(layoutB); ++c) {
EXPECT_EQ(layoutR(c), layoutA(layoutB(c)));
}
}

View File

@ -45,10 +45,10 @@ test_logical_divide(LayoutA const& layoutA,
auto layoutR = logical_divide(layoutA, layoutB);
CUTLASS_TRACE_HOST("test_logical_divide()");
CUTLASS_TRACE_HOST(shape(layoutA) << " / " << shape(layoutB) << " => " << shape(layoutR) );
CUTLASS_TRACE_HOST( shape(layoutA) << " / " << shape(layoutB) << " => " << shape(layoutR));
CUTLASS_TRACE_HOST(stride(layoutA) << " " << stride(layoutB) << " => " << stride(layoutR));
// Test that layout R is compatible with layout B
// Test that layout B is compatible with layout R_0
ASSERT_EQ(rank(layoutR), 2);
ASSERT_TRUE(compatible(layoutB, layout<0>(layoutR)));
}
@ -186,10 +186,10 @@ TEST(CuTe_core, Logical_divide)
// Enforcement for dynamic cases
auto result = logical_divide(layout, tile);
static_assert(decltype(shape<0>(result) == Int<32>{})::value);
static_assert(decltype(stride<0>(result) == Int<1>{})::value);
assert(shape<1>(result) == 1);
static_assert(decltype(stride<1>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(shape<0>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(stride<0>(result) == Int<1>{})::value);
ASSERT_TRUE(shape<1>(result) == 1);
ASSERT_TRUE(decltype(stride<1>(result) == Int<32>{})::value);
}
{
@ -200,10 +200,10 @@ TEST(CuTe_core, Logical_divide)
// Enforcement for dynamic cases
auto result = logical_divide(layout, tile);
static_assert(decltype(shape<0>(result) == Int<32>{})::value);
static_assert(decltype(stride<0>(result) == Int<1>{})::value);
assert(shape<1>(result) == 2);
static_assert(decltype(stride<1>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(shape<0>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(stride<0>(result) == Int<1>{})::value);
ASSERT_TRUE(shape<1>(result) == 2);
ASSERT_TRUE(decltype(stride<1>(result) == Int<32>{})::value);
}
{
@ -221,10 +221,10 @@ TEST(CuTe_core, Logical_divide)
// Enforcement for dynamic cases
auto result = logical_divide(layout, tile);
static_assert(decltype(shape<0>(result) == Int<48>{})::value);
static_assert(decltype(stride<0>(result) == Int<1>{})::value);
assert(shape<1>(result) == 1);
static_assert(decltype(stride<1>(result) == Int<48>{})::value);
ASSERT_TRUE(decltype(shape<0>(result) == Int<48>{})::value);
ASSERT_TRUE(decltype(stride<0>(result) == Int<1>{})::value);
ASSERT_TRUE(shape<1>(result) == 1);
ASSERT_TRUE(decltype(stride<1>(result) == Int<48>{})::value);
}
// DISALLOWED

View File

@ -46,13 +46,9 @@ test_logical_product(LayoutA const& layoutA,
CUTLASS_TRACE_HOST(shape(layoutA) << " x " << shape(layoutB) << " => " << shape(layoutR) );
CUTLASS_TRACE_HOST(stride(layoutA) << " " << stride(layoutB) << " => " << stride(layoutR));
// Test that layout R is compatible with layout B
ASSERT_EQ(rank(layoutR), 2);
//assert(compatible(layoutB, layout<0>(layoutR)));
//assert(consistent(layoutA, layout<1>(layoutR)));
// True post-condition:
ASSERT_TRUE(layoutA == layout<0>(layoutR));
ASSERT_TRUE(compatible(layoutB, layout<1>(layoutR)));
}
TEST(CuTe_core, Logical_product)

File diff suppressed because it is too large Load Diff

View File

@ -58,7 +58,7 @@ template <
class HostEVTNodeBase {
public:
using Gemm = Gemm_;
using TestBedImpl = typename detail::TestbedImpl<Gemm>;
using TestBedImpl = typename detail::TestbedImpl<Gemm, cutlass::epilogue::thread::Identity, true>;
using Kernel = typename Gemm::GemmKernel;
using Epilogue = typename Kernel::CollectiveEpilogue;
using ElementCompute = typename TestBedImpl::ElementCompute;
@ -238,9 +238,9 @@ public:
_bias.resize(cutlass::Coord<1>(_N));
EXPECT_TRUE(
impl_.initialize_tensor(
detail::initialize_tensor(
_bias.host_view(), cutlass::Distribution::Uniform,
impl_.seed + 2023
impl_.collective_mma_inputs.seed + 2023
)
);
_bias.sync_device();
@ -306,9 +306,9 @@ public:
_bias.resize(cutlass::Coord<1>(_M));
EXPECT_TRUE(
impl_.initialize_tensor(
detail::initialize_tensor(
_bias.host_view(), cutlass::Distribution::Uniform,
impl_.seed + 2023
impl_.collective_mma_inputs.seed + 2023
)
);
_bias.sync_device();
@ -393,10 +393,10 @@ public:
)
);
EXPECT_TRUE(
impl_.initialize_tensor(
detail::initialize_tensor(
_tensor_aux_load.host_view(),
cutlass::Distribution::Uniform,
impl_.seed + 2023
impl_.collective_mma_inputs.seed + 2023
)
);
_tensor_aux_load.sync_device();
@ -1154,7 +1154,7 @@ public:
// The EVT Module to test
using EVTModule = typename EVT::EVTModule;
using TestBedImpl = typename detail::TestbedImpl<Gemm>;
using TestBedImpl = typename detail::TestbedImpl<Gemm, cutlass::epilogue::thread::Identity, true>;
using Kernel = typename Gemm::GemmKernel;
using Epilogue = typename Gemm::GemmKernel::CollectiveEpilogue;
using ElementAccumulator = typename Kernel::ElementAccumulator;
@ -1178,7 +1178,9 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(init_A_, init_B_, init_C_, seed_), check_relative_equality(check_relative_equality_) { }
impl_((check_relative_equality_ ? CheckEquality::RELATIVE : CheckEquality::EXACT), ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_),
check_relative_equality(check_relative_equality_) { }
Testbed3xEVT(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
@ -1186,7 +1188,9 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(init_A_, init_B_, init_C_, seed_), check_relative_equality(false) { }
impl_(CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_),
check_relative_equality(false) { }
Testbed3xEVT(
typename LayoutTagA::Stride stride_factor_A_,
@ -1198,15 +1202,10 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(stride_factor_A_,
stride_factor_B_,
stride_factor_C_,
stride_factor_D_,
init_A_,
init_B_,
init_C_,
seed_),
check_relative_equality(false) { }
impl_(stride_factor_A_, stride_factor_B_, stride_factor_C_, stride_factor_D_,
CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_),
check_relative_equality(false) { }
/// Initializes data structures
void initialize(ProblemShapeType problem_size) {
@ -1229,11 +1228,11 @@ public:
auto K = cute::get<2>(problem_shape_MNKL);
auto L = cute::get<3>(problem_shape_MNKL);
auto A = cute::make_tensor(impl_.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
auto B = cute::make_tensor(impl_.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.stride_b));
auto LayoutD = cute::make_layout(cute::make_shape(M, N, L), impl_.stride_d);
auto A = cute::make_tensor(impl_.collective_mma_inputs.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.collective_mma_inputs.stride_a));
auto B = cute::make_tensor(impl_.collective_mma_inputs.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.collective_mma_inputs.stride_b));
auto LayoutD = cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_d);
cutlass::reference::host::GettMainloopParams<ElementAccumulator, decltype(A), decltype(B)> mainloop_params{A, B};
@ -1277,9 +1276,9 @@ public:
<< ", Batch count = " << L << "\n\n";
file
<< "A =\n" << impl_.tensor_A.host_view()
<< "\nB =\n" << impl_.tensor_B.host_view()
<< "\nC =\n" << impl_.tensor_C.host_view() << "\n\n";
<< "A =\n" << impl_.collective_mma_inputs.tensor_A.host_view()
<< "\nB =\n" << impl_.collective_mma_inputs.tensor_B.host_view()
<< "\nC =\n" << impl_.collective_epilogue.tensor_C.host_view() << "\n\n";
file << error_ss.str();
}
@ -1329,15 +1328,15 @@ public:
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
{
impl_.tensor_A.device_data(), impl_.stride_a,
impl_.tensor_B.device_data(), impl_.stride_b
impl_.collective_mma_inputs.tensor_A.device_data(), impl_.collective_mma_inputs.stride_a,
impl_.collective_mma_inputs.tensor_B.device_data(), impl_.collective_mma_inputs.stride_b
},
{ // Epilogue arguments
{}, // thread
static_cast<ElementC*>(host_reference.get_tensor_C_ptr()),
impl_.stride_c,
impl_.collective_epilogue.stride_c,
static_cast<ElementD*>(host_reference.get_tensor_D_ptr()),
impl_.stride_d
impl_.collective_epilogue.stride_d
}, // Epilogue arguments end
hw_info,
scheduler_args

View File

@ -101,7 +101,8 @@ struct Testbed3xTensorBroadcast {
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(init_A_, init_B_, init_C_, seed_) { }
impl_(CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_) { }
Testbed3xTensorBroadcast(
typename LayoutTagA::Stride stride_factor_A_,
@ -117,9 +118,12 @@ struct Testbed3xTensorBroadcast {
stride_factor_B_,
stride_factor_C_,
stride_factor_D_,
CheckEquality::EXACT, ScalarLoc::ON_HOST, VectorBeta::ENABLED,
init_A_,
init_B_,
init_C_,
cutlass::Distribution::Uniform,
cutlass::Distribution::Uniform,
seed_) { }
/// Initializes data structures
@ -135,7 +139,7 @@ struct Testbed3xTensorBroadcast {
auto bias_size = PerColBias ? cute::get<1>(problem_shape_MNKL) : cute::get<0>(problem_shape_MNKL);
bias.resize(cutlass::Coord<1>(bias_size));
EXPECT_TRUE(impl_.initialize_tensor(bias.host_view(), cutlass::Distribution::Uniform, impl_.seed + 2023));
EXPECT_TRUE(detail::initialize_tensor(bias.host_view(), cutlass::Distribution::Uniform, impl_.collective_mma_inputs.seed + 2023));
bias.sync_device();
}
@ -147,8 +151,8 @@ struct Testbed3xTensorBroadcast {
auto c_coord = cutlass::make_Coord(M * L, N);
tensor_C1.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.stride_factor_C));
EXPECT_TRUE(impl_.initialize_tensor(tensor_C1.host_view(), cutlass::Distribution::Uniform, impl_.seed + 2024));
tensor_C1.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.collective_epilogue.stride_factor_C));
EXPECT_TRUE(detail::initialize_tensor(tensor_C1.host_view(), cutlass::Distribution::Uniform, impl_.collective_mma_inputs.seed + 2024));
tensor_C1.sync_device();
}
@ -161,19 +165,19 @@ struct Testbed3xTensorBroadcast {
{
auto [M, N, K, L] = problem_shape_MNKL;
impl_.tensor_D.sync_host();
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_A.host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_B.host_view()), 0);
impl_.collective_epilogue.tensor_D.sync_host();
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_mma_inputs.tensor_A.host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_mma_inputs.tensor_B.host_view()), 0);
if (impl_.tensor_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_D.host_view()), 0);
if (impl_.collective_epilogue.tensor_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_epilogue.tensor_D.host_view()), 0);
}
if (impl_.reference_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.reference_D.host_view()), 0);
if (impl_.collective_epilogue.reference_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_epilogue.reference_D.host_view()), 0);
}
bool passed = cutlass::reference::host::TensorEquals(impl_.reference_D.host_view(), impl_.tensor_D.host_view());
bool passed = cutlass::reference::host::TensorEquals(impl_.collective_epilogue.reference_D.host_view(), impl_.collective_epilogue.tensor_D.host_view());
EXPECT_TRUE(passed);
@ -196,12 +200,12 @@ struct Testbed3xTensorBroadcast {
}
file
<< "A =\n" << impl_.tensor_A.host_view()
<< "\nB =\n" << impl_.tensor_B.host_view()
<< "\nC0 =\n" << impl_.tensor_C.host_view()
<< "A =\n" << impl_.collective_mma_inputs.tensor_A.host_view()
<< "\nB =\n" << impl_.collective_mma_inputs.tensor_B.host_view()
<< "\nC0 =\n" << impl_.collective_epilogue.tensor_C.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\n\nReference =\n" << impl_.reference_D.host_view()
<< "\n\nComputed =\n" <<impl_.tensor_D.host_view();
<< "\n\nReference =\n" << impl_.collective_epilogue.reference_D.host_view()
<< "\n\nComputed =\n" <<impl_.collective_epilogue.tensor_D.host_view();
}
return passed;
@ -221,40 +225,39 @@ struct Testbed3xTensorBroadcast {
auto K = cute::get<2>(problem_shape_MNKL);
auto L = cute::get<3>(problem_shape_MNKL);
auto A = cute::make_tensor(impl_.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
auto B = cute::make_tensor(impl_.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.stride_b));
auto D = cute::make_tensor(impl_.reference_D.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_d));
auto A = cute::make_tensor(impl_.collective_mma_inputs.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.collective_mma_inputs.stride_a));
auto B = cute::make_tensor(impl_.collective_mma_inputs.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.collective_mma_inputs.stride_b));
auto D = cute::make_tensor(impl_.collective_epilogue.reference_D.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_d));
auto Bias = cute::make_tensor(static_cast<ElementBias*>(use_bias ? bias.host_data() : nullptr),
cute::make_layout(PerColBias ? cute::make_shape(1, N) : cute::make_shape(M, 1)));
auto C0 = cute::make_tensor(impl_.tensor_C.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
auto C0 = cute::make_tensor(impl_.collective_epilogue.tensor_C.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
auto C1 = cute::make_tensor(tensor_C1.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
// Create host workspace for output of testbed. This computes a portion of the epilogue:
// ref_compute_out = Activation(alpha * (A @ B) + bias)
cutlass::HostTensor<ElementCompute, LayoutTagC> ref_compute_out;
auto c_coord = cutlass::make_Coord(M * L, N);
ref_compute_out.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.stride_factor_C), false);
ref_compute_out.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.collective_epilogue.stride_factor_C), false);
auto RefComputeOut = cute::make_tensor(ref_compute_out.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
cutlass::reference::host::GettMainloopParams<ElementAccumulator, decltype(A), decltype(B)> mainloop_params{A, B};
// Use a dummy null tensor for operand C because the epilogue overrides C.
auto dummy_C = cute::make_tensor(static_cast<ElementC*>(nullptr),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
ElementCompute dummy_beta(0);
auto dummy_Aux = cute::make_tensor(static_cast<ElementD*>(nullptr),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_d));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_d));
auto dummy_Valpha = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
cute::make_layout(cute::make_shape(M, 1)));
auto dummy_Vbeta = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
cute::make_layout(cute::make_shape(M, 1)));
cutlass::reference::host::GettEpilogueParams<
ElementScalar,
ElementScalar,
@ -361,17 +364,17 @@ struct Testbed3xTensorBroadcast {
arguments = typename Gemm::Arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
{ impl_.tensor_A.device_data(), impl_.stride_a,
impl_.tensor_B.device_data(), impl_.stride_b,
{ impl_.collective_mma_inputs.tensor_A.device_data(), impl_.collective_mma_inputs.stride_a,
impl_.collective_mma_inputs.tensor_B.device_data(), impl_.collective_mma_inputs.stride_b,
impl_.mma_promotion_interval
},
{ // Epilogue arguments
{ alpha, beta }, // ThreadOp arguments
impl_.stride_c,
impl_.tensor_D.device_data(),
impl_.stride_d,
impl_.collective_epilogue.stride_c,
impl_.collective_epilogue.tensor_D.device_data(),
impl_.collective_epilogue.stride_d,
use_bias ? bias.device_data() : nullptr,
impl_.tensor_C.device_data(),
impl_.collective_epilogue.tensor_C.device_data(),
tensor_C1.device_data()
}, // Epilogue arguments end
hw_info

View File

@ -112,6 +112,8 @@ TEST(SM80_Device_Gemm_tf32t_tf32n_f32n_tensor_op_f32, 128x128x32_64x64x64) {
EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Gemm_tf32t_tf32t_f32n_tensor_op_f32, 128x128x32_64x64x64) {
using Config = cutlass::gemm::device::DefaultGemmConfigurationToCutlass3Types<
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
@ -132,4 +134,24 @@ TEST(SM80_Device_Gemm_tf32t_tf32t_f32n_tensor_op_f32, 128x128x32_64x64x64) {
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Gemm_tf32t_tf32n_f32n_tensor_op_f32, 128x128x32_64x64x64_profiling) {
using Config = cutlass::gemm::device::DefaultGemmConfigurationToCutlass3Types<
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
cutlass::tfloat32_t, cutlass::layout::RowMajor,
cutlass::tfloat32_t, cutlass::layout::ColumnMajor,
float, cutlass::layout::RowMajor,
float>;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
Config::CollectiveMainloop,
Config::CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
EXPECT_TRUE(test::gemm::device::TestGemmPerf3x<Gemm>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
//#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -97,9 +97,7 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
test::gemm::device::Testbed3x<Gemm, cutlass::epilogue::thread::ReLu> testbed;
bool passed = test::gemm::device::TestAll<Gemm>(1, 1, testbed);
bool passed = test::gemm::device::TestAll<Gemm, cutlass::epilogue::thread::ReLu>(1, 1);
EXPECT_TRUE(passed);
}
@ -156,6 +154,7 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
#pragma GCC diagnostic pop // Re-enable deprecation warnings
}
TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 256x128x64_2x2x1_BiasF32_ReLU) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
@ -239,9 +238,8 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool check_relative_equality = true;
bool passed = test::gemm::device::TestAllBiasElementwise<Gemm>(1, 1, check_relative_equality);
using namespace test::gemm::device;
bool passed = TestAllBiasElementwise<Gemm>(1, 1, CheckEquality::RELATIVE);
EXPECT_TRUE(passed);
}
@ -600,8 +598,8 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool passed = test::gemm::device::TestAllBiasElementwise<Gemm>(1.0, 0.0, /*check_relative_equality=*/true);
using namespace test::gemm::device;
bool passed = TestAllBiasElementwise<Gemm>(1.0, 0.0, CheckEquality::RELATIVE);
EXPECT_TRUE(passed);
}

View File

@ -97,8 +97,7 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_persistent_epilogue, 128
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
test::gemm::device::Testbed3x<Gemm, cutlass::epilogue::thread::ReLu> testbed;
bool passed = test::gemm::device::TestAll<Gemm>(1, 1, testbed);
bool passed = test::gemm::device::TestAll<Gemm, cutlass::epilogue::thread::ReLu>(1, 1);
EXPECT_TRUE(passed);
}
@ -186,8 +185,8 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_persistent_epilogue, 128
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool check_relative_equality = true;
bool passed = test::gemm::device::TestAllBiasElementwise<Gemm>(1, 1, check_relative_equality);
using namespace test::gemm::device;
bool passed = TestAllBiasElementwise<Gemm>(1, 1, CheckEquality::RELATIVE);
EXPECT_TRUE(passed);
}

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@@ -50,6 +50,7 @@
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
#if (!((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 8)))
TEST(SM80_Device_Syr2k_cf32n_cf32t_l_tensor_op_f32, 64x64x16_32x32x16) {
@@ -145,6 +146,7 @@ TEST(SM80_Device_Syr2k_cf32n_cf32t_u_tensor_op_f32, 64x64x16_32x32x16) {
EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@@ -50,6 +50,7 @@
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
#if (!((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 8)))
TEST(SM80_Device_Syr2k_cf32n_cf32t_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
@@ -145,6 +146,7 @@ TEST(SM80_Device_Syr2k_cf32n_cf32t_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@@ -50,6 +50,7 @@
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
#if (!((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 8)))
TEST(SM80_Device_Syr2k_cf64n_cf64n_l_tensor_op_f64, 32x32x16_16x16x16) {
@@ -145,6 +146,7 @@ TEST(SM80_Device_Syr2k_cf64n_cf64n_u_tensor_op_f64, 32x32x16_16x16x16) {
EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@@ -0,0 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/