update 3.8 v2 (#2112)

* update 3.8 v2

* update 3.8

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
This commit is contained in:
Yujia Zhai
2025-02-19 19:03:14 -08:00
committed by GitHub
parent e9627ce55b
commit b84e9802d8
166 changed files with 3986 additions and 4037 deletions

View File

@ -47,11 +47,9 @@ cutlass_test_unit_add_executable(
math.cpp
mixedbits.cpp
nullspace.cpp
packed_tuple.cpp
pointer.cpp
reverse.cpp
swizzle_layout.cpp
transform.cpp
tuple.cpp
tuple_find.cpp
)

View File

@ -1,581 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <cute/container/tuple.hpp>
#include <cute/container/packed_tuple.hpp>
#include <cute/algorithm/tuple_algorithms.hpp>
#include <cute/tensor.hpp>
namespace pt_test {
// Wrapper holding a single value of type T.  Used by the tuple tests
// as a guaranteed-nonempty (stateful) element type.
template <class T>
struct Nonempty {
  T datum;  // the wrapped value

  Nonempty(T const& t) : datum{t} {}

  // Two wrappers are equal exactly when the wrapped values are.
  friend bool operator==(Nonempty<T> const& a, Nonempty<T> const& b) {
    return a.datum == b.datum;
  }
  friend bool operator!=(Nonempty<T> const& a, Nonempty<T> const& b) {
    return not (a == b);
  }
};
// A stateless class type.  The Tag parameter makes distinct
// instantiations distinct types; comparison is permitted across tags
// and holds exactly when the tag values match.
template <int Tag>
struct Empty {
  template <int OtherTag>
  friend bool operator==(Empty<Tag> const&, Empty<OtherTag> const&) {
    return Tag == OtherTag;
  }
  template <int OtherTag>
  friend bool operator!=(Empty<Tag> const& a, Empty<OtherTag> const& b) {
    return not (a == b);
  }
};
// Compile-time survey of which tuple types are standard layout.
// std::tuple
static_assert(cute::is_standard_layout_v<std::tuple<>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<int>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<double>>); // it happens to be
static_assert(not cute::is_standard_layout_v<std::tuple<int, double>>); // it's not
#if ! defined(CUTLASS_USE_PACKED_TUPLE)
// cute::tuple (only when it is not an alias for packed_tuple)
static_assert(cute::is_standard_layout_v<cute::tuple<>>); // it happens to be
static_assert(cute::is_standard_layout_v<cute::tuple<int>>); // it happens to be
static_assert(cute::is_standard_layout_v<cute::tuple<double>>); // it happens to be
static_assert(not cute::is_standard_layout_v<cute::tuple<int, double>>); // it's not
#endif // CUTLASS_USE_PACKED_TUPLE
// cute::packed_tuple: standard layout even with multiple nonempty or
// nested elements, which std::tuple (above) is not.
static_assert(cute::is_standard_layout_v<cute::packed_tuple<>>);
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int>>);
static_assert(cute::is_standard_layout_v<cute::packed_tuple<double>>);
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, double>>); // it is
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, int, int, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, cute::packed_tuple<int, int>, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, cute::packed_tuple<Empty<0>, Empty<0>>, int>>); // it is
//////////////////////////////////////////////////////////////////////
// packed_tuple test starts here
//////////////////////////////////////////////////////////////////////
// Check that cute::packed_tuple<Args...> behaves like the std::tuple it
// is built from: standard layout is preserved when all Args have it,
// emptiness is preserved when all Args are empty, tuple_size reports the
// full arity, element types match, and element values compare equal.
// NOTE(review): ExpectedPackedSize is never used in the body -- possibly
// a static_assert on the number of stored elements was intended; confirm.
template <
class ExpectedPackedType,
size_t ExpectedPackedSize,
class ... Args>
constexpr void
test_packed_type_alias([[maybe_unused]] ExpectedPackedType packed, std::tuple<Args...> unpacked)
{
using cute::packed_tuple;
// Packing must preserve standard-layout-ness...
if constexpr ((cute::is_standard_layout_v<Args> && ...)) {
static_assert(cute::is_standard_layout_v<packed_tuple<Args...>>);
}
// ...and emptiness.
if constexpr ((cute::is_empty_v<Args> && ...)) {
static_assert(cute::is_empty_v<packed_tuple<Args...>>);
}
// tuple_size reports the full arity, regardless of storage.
static_assert(cute::tuple_size_v<packed_tuple<Args...>> == sizeof...(Args));
auto test_element = [unpacked] (auto index) {
// Element types must agree with std::tuple's.
static_assert(cute::is_same_v<
std::tuple_element_t<index, packed_tuple<Args...>>,
std::tuple_element_t<index, std::tuple<Args...>>
>);
// Rebuild a packed_tuple from the unpacked std::tuple and compare
// the element at `index`.
packed_tuple<Args...> sl = cute::apply(unpacked, [](auto... a){ return cute::make_packed_tuple(a...); });
EXPECT_EQ(std::get<index>(unpacked), cute::get<index>(sl));
};
cute::for_each(std::make_index_sequence<sizeof...(Args)>(), test_element);
}
// Exercise test_packed_type_alias over many permutations of empty and
// nonempty element types.  The first template argument is the expected
// "packed" representation (empty elements removed), the second is its
// expected arity.
void test_packed_type_aliases() {
using cute::packed_tuple;
test_packed_type_alias<packed_tuple<>, 0>({}, {});
test_packed_type_alias<packed_tuple<int>, 1, int>({7}, {7});
test_packed_type_alias<packed_tuple<double>, 1, double>({1.5}, {1.5});
// Make sure that class types are handled the same as scalar types
test_packed_type_alias<packed_tuple<Nonempty<int>>, 1, Nonempty<int>>(
{Nonempty{7}}, {Nonempty{7}});
test_packed_type_alias<packed_tuple<Nonempty<double>>, 1, Nonempty<double>>(
{Nonempty{1.5}}, {Nonempty{1.5}});
// All-empty inputs pack to the empty tuple.
test_packed_type_alias<packed_tuple<>, 0, Empty<0>>({}, {});
test_packed_type_alias<packed_tuple<>, 0, Empty<0>, Empty<1>>(
{}, {Empty<0>{}, Empty<1>{}});
test_packed_type_alias<packed_tuple<>, 0, Empty<0>, Empty<1>, Empty<2>>(
{}, {Empty<0>{}, Empty<1>{}, Empty<2>{}});
// One nonempty element among empties, in every position.
test_packed_type_alias<packed_tuple<int>, 1, Empty<0>, int>(
{7}, {Empty<0>{}, 7});
test_packed_type_alias<packed_tuple<int>, 1, int, Empty<0>>(
{7}, {7, Empty<0>{}});
test_packed_type_alias<packed_tuple<int>, 1, int, Empty<0>, Empty<1>>(
{7}, {7, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<packed_tuple<int>, 1, Empty<0>, int, Empty<1>>(
{7}, {Empty<0>{}, 7, Empty<1>{}});
test_packed_type_alias<packed_tuple<int>, 1, Empty<0>, Empty<1>, int>(
{7}, {Empty<0>{}, Empty<1>{}, 7});
// Two nonempty elements among empties.
test_packed_type_alias<packed_tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, Empty<0>, double>(
{7, 1.5}, {7, Empty<0>{}, 1.5});
// NOTE(review): exact duplicate of the (int, double, Empty<0>) case two
// calls above -- possibly a different permutation was intended; confirm.
test_packed_type_alias<packed_tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, double, Empty<0>, Empty<1>>(
{7, 1.5}, {7, 1.5, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, Empty<0>, double, Empty<1>>(
{7, 1.5}, {7, Empty<0>{}, 1.5, Empty<1>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, Empty<0>, Empty<1>, double>(
{7, 1.5}, {7, Empty<0>{}, Empty<1>{}, 1.5});
test_packed_type_alias<packed_tuple<int, double>, 2, Empty<0>, int, Empty<1>, double>(
{7, 1.5}, {Empty<0>{}, 7, Empty<1>{}, 1.5});
test_packed_type_alias<packed_tuple<int, double>, 2, Empty<0>, Empty<1>, int, double>(
{7, 1.5}, {Empty<0>{}, Empty<1>{}, 7, 1.5});
// Three nonempty elements with one empty element in every position.
test_packed_type_alias<packed_tuple<int, double, float>, 3, Empty<0>, int, double, float>(
{7, 1.5, 2.5f}, {Empty<0>{}, 7, 1.5, 2.5f});
test_packed_type_alias<packed_tuple<int, double, float>, 3, int, Empty<0>, double, float>(
{7, 1.5, 2.5f}, {7, Empty<0>{}, 1.5, 2.5f});
test_packed_type_alias<packed_tuple<int, double, float>, 3, int, double, Empty<0>, float>(
{7, 1.5, 2.5f}, {7, 1.5, Empty<0>{}, 2.5f});
test_packed_type_alias<packed_tuple<int, double, float>, 3, int, double, float, Empty<0>>(
{7, 1.5, 2.5f}, {7, 1.5, 2.5f, Empty<0>{}});
}
// Compile-time check: does std::tuple_element report
// ExpectedElementType as element number Which of Tuple?
template <class Tuple, size_t Which, class ExpectedElementType>
constexpr bool test_tuple_element() {
  using actual_element_type = std::tuple_element_t<Which, Tuple>;
  return cute::is_same_v<actual_element_type, ExpectedElementType>;
}
// Spot-check test_tuple_element for both std::tuple and
// cute::packed_tuple holding an empty element type.
void test_tuple_elements() {
using cute::packed_tuple;
static_assert(test_tuple_element<std::tuple<Empty<0>>, 0, Empty<0>>());
static_assert(test_tuple_element<packed_tuple<Empty<0>>, 0, Empty<0>>());
}
// A default-constructible (and empty) type.  The Value tag makes
// distinct instantiations distinct types.
template <size_t Value>
struct DefaultConstructible {};
void test_default_constructibility() {
using cute::packed_tuple;
{
[[maybe_unused]] packed_tuple<> t_p_0;
[[maybe_unused]] packed_tuple<DefaultConstructible<0>> t_p_1;
[[maybe_unused]] packed_tuple<DefaultConstructible<0>, DefaultConstructible<1>> t_p_2;
[[maybe_unused]] packed_tuple<DefaultConstructible<0>, int, DefaultConstructible<1>> t_p_3;
}
}
// Verify that packed_tuple spends no storage on empty element types:
// a tuple of (int, Empty, double) has the size of {int, double}, and a
// tuple of only empty (possibly nested) elements is itself empty.
void test_sizes_and_not_storing_empty_types() {
using cute::packed_tuple;
[[maybe_unused]] packed_tuple<
int,
pt_test::Empty<0>,
double
> pt{42, pt_test::Empty<0>{}, 1.5};
static_assert(cute::is_standard_layout_v<decltype(pt)>);
// packed_result_type must only store the packed tuple,
// and not the integer_sequence(s) used to access it.
// The latter can be represented entirely at compile time as types.
struct { int i; double j; } IntDouble;
static_assert(sizeof(pt) == sizeof(IntDouble));
EXPECT_EQ(cute::get<0>(pt), 42);
EXPECT_EQ(cute::get<1>(pt), pt_test::Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), 1.5);
// A tuple of empty types, however deeply nested, is empty too
// (and thus has size 1, the minimum object size).
packed_tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
packed_tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
packed_tuple<pt_test::Empty<0>, packed_tuple<>>
>
> pt_empty{};
static_assert(cute::is_empty_v<decltype(pt_empty)>);
static_assert(cute::is_standard_layout_v<decltype(pt_empty)>);
static_assert(sizeof(pt_empty) == 1);
// Template arguments must be default constructible,
// and packed_tuple itself needs a default constructor.
[[maybe_unused]] packed_tuple<
packed_tuple<int, pt_test::Empty<2>>,
double,
pt_test::Empty<3>> pt2;
static_assert(cute::is_standard_layout_v<decltype(pt2)>);
// cute::packed_tuple, like the original cute::tuple, does not
// promise to have working CTAD (constructor template argument
// deduction).
[[maybe_unused]] packed_tuple<
packed_tuple<int, pt_test::Empty<0>>,
pt_test::Empty<1>
> pt3{
packed_tuple<int, pt_test::Empty<0>>{42, pt_test::Empty<0>{}},
pt_test::Empty<1>{}
};
static_assert(cute::is_standard_layout_v<decltype(pt3)>);
static_assert(cute::is_same_v<
cute::tuple_element_t<0, decltype(pt3)>,
packed_tuple<int, pt_test::Empty<0>>>);
static_assert(cute::is_same_v<
cute::tuple_element_t<1, decltype(pt3)>,
pt_test::Empty<1>>);
static_assert(cute::tuple_size_v<cute::tuple_element_t<0, decltype(pt3)>> == 2u);
// Element access works through the nesting, one level at a time.
packed_tuple<int, pt_test::Empty<0>> pt3_0 = cute::get<0>(pt3);
auto pt3_0_1 = cute::get<1>(pt3_0);
static_assert(cute::is_same_v<decltype(pt3_0_1), pt_test::Empty<0>>);
EXPECT_EQ(cute::get<0>(cute::get<0>(pt3)), 42);
EXPECT_EQ(cute::get<1>(cute::get<0>(pt3)), pt_test::Empty<0>{});
}
} // namespace pt_test
// Driver that runs the run-time portions of the pt_test functions
// defined above (the static_asserts already ran at compile time).
TEST(CuTe_core, PackedTuple2)
{
CUTLASS_TRACE_HOST("-------------------------------");
CUTLASS_TRACE_HOST("packed_tuple");
CUTLASS_TRACE_HOST("-------------------------------");
pt_test::test_packed_type_aliases();
pt_test::test_tuple_elements();
pt_test::test_default_constructibility();
pt_test::test_sizes_and_not_storing_empty_types();
}
// Element access (cute::get) on packed_tuple across value categories:
// each element-type combination is exercised on a mutable tuple
// (read and write), a const tuple (reference type checked), and an
// rvalue tuple.
TEST(CuTe_core, PackedTuple2Get) {
using cute::packed_tuple;
using pt_test::Empty;
using pt_test::Nonempty;
// packed_tuple<int>: one nonempty element.
{
using tuple_type = packed_tuple<int>;
tuple_type pt{42};
static_assert(cute::tuple_size_v<tuple_type> == 1u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
EXPECT_EQ(cute::get<0>(pt), 42);
cute::get<0>(pt) = 43;
EXPECT_EQ(cute::get<0>(pt), 43);
}
{
using tuple_type = packed_tuple<int>;
tuple_type const pt{42};
EXPECT_EQ(cute::get<0>(pt), 42);
static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
}
{
EXPECT_EQ(cute::get<0>(packed_tuple<int>{42}), 42);
}
// packed_tuple<Empty<0>>: one empty element.
{
using tuple_type = packed_tuple<pt_test::Empty<0>>;
tuple_type pt;
static_assert(cute::tuple_size_v<tuple_type> == 1u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, pt_test::Empty<0>>);
EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
}
{
using tuple_type = packed_tuple<pt_test::Empty<0>>;
tuple_type const pt;
EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
}
{
using tuple_type = packed_tuple<pt_test::Empty<0>>;
EXPECT_EQ(cute::get<0>(tuple_type{}), pt_test::Empty<0>{});
}
// packed_tuple<int, double>: two nonempty elements.
{
using tuple_type = packed_tuple<int, double>;
tuple_type pt{1, 2.5};
static_assert(cute::tuple_size_v<tuple_type> == 2u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
EXPECT_EQ(cute::get<0>(pt), 1);
cute::get<0>(pt) = 2;
EXPECT_EQ(cute::get<0>(pt), 2);
EXPECT_EQ(cute::get<1>(pt), 2.5);
cute::get<1>(pt) = 3.5;
EXPECT_EQ(cute::get<1>(pt), 3.5);
}
{
using tuple_type = packed_tuple<int, double>;
tuple_type const pt{1, 2.5};
EXPECT_EQ(cute::get<0>(pt), 1);
static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
EXPECT_EQ(cute::get<1>(pt), 2.5);
static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
}
{
using tuple_type = packed_tuple<int, double>;
EXPECT_EQ(cute::get<0>(tuple_type{1, 2.5}), 1);
EXPECT_EQ(cute::get<1>(tuple_type{1, 2.5}), 2.5);
}
// packed_tuple<Empty<0>, double>: empty first element.  Note the
// static_assert below: get<0> of a const tuple yields Empty<0> by
// value, not by const reference.
{
using tuple_type = packed_tuple<Empty<0>, double>;
tuple_type pt{Empty<0>{}, 2.5};
static_assert(cute::tuple_size_v<tuple_type> == 2u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, Empty<0>>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
EXPECT_EQ(cute::get<1>(pt), 2.5);
cute::get<1>(pt) = 3.5;
EXPECT_EQ(cute::get<1>(pt), 3.5);
}
{
using tuple_type = packed_tuple<Empty<0>, double>;
tuple_type const pt{Empty<0>{}, 2.5};
EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), Empty<0>>);
EXPECT_EQ(cute::get<1>(pt), 2.5);
static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
}
{
using tuple_type = packed_tuple<Empty<0>, double>;
EXPECT_EQ(cute::get<0>(tuple_type{Empty<0>{}, 2.5}), Empty<0>{});
EXPECT_EQ(cute::get<1>(tuple_type{Empty<0>{}, 2.5}), 2.5);
}
// packed_tuple<int, double, Nonempty<float>>: three nonempty elements.
{
using tuple_type = packed_tuple<int, double, Nonempty<float>>;
tuple_type pt{1, 2.5, Nonempty{3.25f}};
static_assert(cute::tuple_size_v<tuple_type> == 3u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), 2.5);
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
cute::get<0>(pt) = 42;
EXPECT_EQ(cute::get<0>(pt), 42);
cute::get<1>(pt) = 4.5;
EXPECT_EQ(cute::get<1>(pt), 4.5);
cute::get<2>(pt) = Nonempty<float>{3.75f};
EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
}
{
using tuple_type = packed_tuple<int, double, Nonempty<float>>;
tuple_type const pt{1, 2.5, Nonempty{3.25f}};
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), 2.5);
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
}
{
using tuple_type = packed_tuple<int, double, Nonempty<float>>;
EXPECT_EQ((cute::get<0>(tuple_type{1, 2.5, Nonempty{3.25f}})), 1);
EXPECT_EQ((cute::get<1>(tuple_type{1, 2.5, Nonempty{3.25f}})), 2.5);
EXPECT_EQ((cute::get<2>(tuple_type{1, 2.5, Nonempty{3.25f}})), Nonempty{3.25f});
}
// packed_tuple<int, Empty<0>, Nonempty<float>>: empty element in the
// middle.
{
using tuple_type = packed_tuple<int, Empty<0>, Nonempty<float>>;
packed_tuple<int, Empty<0>, Nonempty<float>> pt{1, Empty<0>{}, Nonempty{3.25f}};
static_assert(cute::tuple_size_v<tuple_type> == 3u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, Empty<0>>);
static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
cute::get<0>(pt) = 42;
EXPECT_EQ(cute::get<0>(pt), 42);
cute::get<2>(pt) = Nonempty<float>{3.75f};
EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
}
{
using tuple_type = packed_tuple<int, Empty<0>, Nonempty<float>>;
tuple_type const pt{1, Empty<0>{}, Nonempty{3.25f}};
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
}
{
using tuple_type = packed_tuple<int, Empty<0>, Nonempty<float>>;
EXPECT_EQ((cute::get<0>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), 1);
EXPECT_EQ((cute::get<1>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Empty<0>{});
EXPECT_EQ((cute::get<2>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Nonempty{3.25f});
}
}
namespace pt_test {
// An empty class type to which Empty<Value> implicitly converts.
// Used to test converting construction of tuple elements.
template<int Value>
struct ConvertibleFromEmpty {
  constexpr ConvertibleFromEmpty() = default;
  constexpr ConvertibleFromEmpty(Empty<Value>) {}

  // Cross-tag comparison: equal exactly when the tag values match.
  template <int OtherValue>
  friend constexpr bool operator==(ConvertibleFromEmpty<Value> const&, ConvertibleFromEmpty<OtherValue> const&) {
    return Value == OtherValue;
  }
  template <int OtherValue>
  friend constexpr bool operator!=(ConvertibleFromEmpty<Value> const& a, ConvertibleFromEmpty<OtherValue> const& b) {
    return not (a == b);
  }
};
} // end namespace pt_test
// Instantiate cute::detail::ESO_t (packed_tuple's storage helper --
// NOTE(review): confirm semantics against cute/container/packed_tuple.hpp)
// in constexpr context for every empty/nonempty element combination.
TEST(CuTe_core, PackedTupleConstexprDefaultConstruction) {
// Make sure that packed_tuple's default constructor is constexpr.
// MSVC makes this a bit more challenging than usual.
using pt_test::Empty;
{
[[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>> eso1{};
[[maybe_unused]] constexpr cute::detail::ESO_t<int64_t> eso2{};
}
{
[[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, Empty<1>> eso0{};
[[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, Empty<1>> eso1{};
[[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, int64_t> eso2{};
[[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, int64_t> eso3{};
}
}
// Converting construction: each case is run for cute::tuple and then
// for cute::packed_tuple, constructing elements from convertible
// argument types (int -> Nonempty<int>, Empty<0> -> ConvertibleFromEmpty<0>).
TEST(CuTe_core, PackedTupleConvertingConstruction) {
using cute::packed_tuple;
using pt_test::ConvertibleFromEmpty;
using pt_test::Empty;
using pt_test::Nonempty;
// Single element, converting from int.
{
using tuple_type = cute::tuple<Nonempty<int>>;
[[maybe_unused]] tuple_type t(7);
EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<Nonempty<int>>;
[[maybe_unused]] tuple_type t(7);
EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
}
// Single element, converting from an empty type.
{
using tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
[[maybe_unused]] tuple_type t(Empty<0>{});
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
}
{
using tuple_type = packed_tuple<ConvertibleFromEmpty<0>>;
[[maybe_unused]] tuple_type t(Empty<0>{});
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
}
// Two elements, both nonempty.
{
using tuple_type = cute::tuple<float, Nonempty<int>>;
[[maybe_unused]] tuple_type t(1.5f, 7);
EXPECT_EQ(cute::get<0>(t), 1.5f);
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<float, Nonempty<int>>;
[[maybe_unused]] tuple_type t(1.5f, 7);
EXPECT_EQ(cute::get<0>(t), 1.5f);
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
// Two elements, empty first element passed through unchanged.
{
using tuple_type = cute::tuple<Empty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), Empty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<Empty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), Empty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
// Two elements, empty first element converted on construction.
{
using tuple_type = cute::tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
// Nested tuples constructed from an inner tuple value.
{
using inner_tuple_type = cute::tuple<Empty<0>>;
using outer_tuple_type = cute::tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
{
using inner_tuple_type = packed_tuple<Empty<0>>;
using outer_tuple_type = packed_tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
{
using inner_tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
using outer_tuple_type = cute::tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
{
using inner_tuple_type = packed_tuple<ConvertibleFromEmpty<0>>;
using outer_tuple_type = packed_tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
}

View File

@ -32,6 +32,13 @@
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <cute/container/tuple.hpp>
#include <cute/algorithm/tuple_algorithms.hpp>
#include <cute/tensor.hpp>
TEST(CuTe_core, Tuple)
@ -120,6 +127,11 @@ TEST(CuTe_core, Tuple)
ASSERT_TRUE(sizeof(tuple_3h_m_type) == 12);
ASSERT_TRUE(!std::is_empty<tuple_3h_m_type>::value);
ASSERT_TRUE(sizeof(cute::tuple<_1, _1, cute::tuple<int32_t>>) == 4);
ASSERT_TRUE(sizeof(cute::tuple<_1, _0, cute::tuple<int32_t>>) == 4);
ASSERT_TRUE(sizeof(cute::tuple<_1, cute::tuple<_1, int32_t>>) == 4);
ASSERT_TRUE(sizeof(cute::tuple<_1, cute::tuple<_0, int32_t>>) == 4);
CUTLASS_TRACE_HOST("-------------------------------");
CUTLASS_TRACE_HOST("SIMPLE TUPLE OPS");
CUTLASS_TRACE_HOST("-------------------------------");
@ -264,3 +276,588 @@ TEST(CuTe_core, Tuple)
CUTLASS_TRACE_HOST("a(_,1,_,(1,2)) = " << dice(make_coord(_,1,_,make_coord(1,2)), a));
}
}
// Test helpers and checks for cute::tuple.  This namespace mirrors the
// packed_tuple tests visible earlier in this commit, with cute::tuple
// substituted for cute::packed_tuple.
namespace pt_test {
// Wrapper holding a single value of type T; a guaranteed-nonempty
// element type for the tuple tests below.
template <class T>
struct Nonempty {
T datum;
Nonempty(T const& t) : datum{t} {}
friend bool operator==(Nonempty<T> const& lhs, Nonempty<T> const& rhs) {
return lhs.datum == rhs.datum;
}
friend bool operator!=(Nonempty<T> const& lhs, Nonempty<T> const& rhs) {
return !(lhs == rhs);
}
};
// A stateless class type; the V tag makes distinct instantiations
// distinct types.  Cross-tag comparison holds when the tags match.
template <int V>
struct Empty {
template <int W>
friend bool operator==(Empty<V> const&, Empty<W> const&) {
return V == W;
}
template <int W>
friend bool operator!=(Empty<V> const& lhs, Empty<W> const& rhs) {
return !(lhs == rhs);
}
};
// Compile-time survey of which tuple types are standard layout.
// std::tuple
static_assert(cute::is_standard_layout_v<std::tuple<>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<int>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<double>>); // it happens to be
static_assert(not cute::is_standard_layout_v<std::tuple<int, double>>); // it's not
// cute::tuple: standard layout even with multiple nonempty or nested
// elements, which std::tuple (above) is not.
static_assert(cute::is_standard_layout_v<cute::tuple<>>);
static_assert(cute::is_standard_layout_v<cute::tuple<int>>);
static_assert(cute::is_standard_layout_v<cute::tuple<double>>);
static_assert(cute::is_standard_layout_v<cute::tuple<int, double>>); // it is
static_assert(cute::is_standard_layout_v<cute::tuple<int, int, int, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::tuple<int, cute::tuple<int, int>, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::tuple<int, cute::tuple<Empty<0>, Empty<0>>, int>>); // it is
//////////////////////////////////////////////////////////////////////
// tuple test starts here
//////////////////////////////////////////////////////////////////////
// Check that cute::tuple<Args...> behaves like the std::tuple it is
// built from: standard layout and emptiness are preserved, tuple_size
// reports the full arity, element types match, and element values
// compare equal.
// NOTE(review): ExpectedPackedSize is never used in the body -- confirm
// whether a static_assert on it was intended.
template <
class ExpectedPackedType,
size_t ExpectedPackedSize,
class ... Args>
constexpr void
test_packed_type_alias([[maybe_unused]] ExpectedPackedType packed, std::tuple<Args...> unpacked)
{
using cute::tuple;
if constexpr ((cute::is_standard_layout_v<Args> && ...)) {
static_assert(cute::is_standard_layout_v<tuple<Args...>>);
}
if constexpr ((cute::is_empty_v<Args> && ...)) {
static_assert(cute::is_empty_v<tuple<Args...>>);
}
static_assert(cute::tuple_size_v<tuple<Args...>> == sizeof...(Args));
auto test_element = [unpacked] (auto index) {
static_assert(cute::is_same_v<
std::tuple_element_t<index, tuple<Args...>>,
std::tuple_element_t<index, std::tuple<Args...>>
>);
// Rebuild a cute::tuple from the unpacked std::tuple and compare
// the element at `index`.
tuple<Args...> sl = cute::apply(unpacked, [](auto... a){ return cute::make_tuple(a...); });
EXPECT_EQ(std::get<index>(unpacked), cute::get<index>(sl));
};
cute::for_each(std::make_index_sequence<sizeof...(Args)>(), test_element);
}
// Exercise test_packed_type_alias over many permutations of empty and
// nonempty element types.
void test_packed_type_aliases() {
using cute::tuple;
test_packed_type_alias<tuple<>, 0>({}, {});
test_packed_type_alias<tuple<int>, 1, int>({7}, {7});
test_packed_type_alias<tuple<double>, 1, double>({1.5}, {1.5});
// Make sure that class types are handled the same as scalar types
test_packed_type_alias<tuple<Nonempty<int>>, 1, Nonempty<int>>(
{Nonempty{7}}, {Nonempty{7}});
test_packed_type_alias<tuple<Nonempty<double>>, 1, Nonempty<double>>(
{Nonempty{1.5}}, {Nonempty{1.5}});
test_packed_type_alias<tuple<>, 0, Empty<0>>({}, {});
test_packed_type_alias<tuple<>, 0, Empty<0>, Empty<1>>(
{}, {Empty<0>{}, Empty<1>{}});
test_packed_type_alias<tuple<>, 0, Empty<0>, Empty<1>, Empty<2>>(
{}, {Empty<0>{}, Empty<1>{}, Empty<2>{}});
test_packed_type_alias<tuple<int>, 1, Empty<0>, int>(
{7}, {Empty<0>{}, 7});
test_packed_type_alias<tuple<int>, 1, int, Empty<0>>(
{7}, {7, Empty<0>{}});
test_packed_type_alias<tuple<int>, 1, int, Empty<0>, Empty<1>>(
{7}, {7, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<tuple<int>, 1, Empty<0>, int, Empty<1>>(
{7}, {Empty<0>{}, 7, Empty<1>{}});
test_packed_type_alias<tuple<int>, 1, Empty<0>, Empty<1>, int>(
{7}, {Empty<0>{}, Empty<1>{}, 7});
test_packed_type_alias<tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<tuple<int, double>, 2, int, Empty<0>, double>(
{7, 1.5}, {7, Empty<0>{}, 1.5});
// NOTE(review): exact duplicate of the (int, double, Empty<0>) case two
// calls above -- possibly a different permutation was intended; confirm.
test_packed_type_alias<tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<tuple<int, double>, 2, int, double, Empty<0>, Empty<1>>(
{7, 1.5}, {7, 1.5, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<tuple<int, double>, 2, int, Empty<0>, double, Empty<1>>(
{7, 1.5}, {7, Empty<0>{}, 1.5, Empty<1>{}});
test_packed_type_alias<tuple<int, double>, 2, int, Empty<0>, Empty<1>, double>(
{7, 1.5}, {7, Empty<0>{}, Empty<1>{}, 1.5});
test_packed_type_alias<tuple<int, double>, 2, Empty<0>, int, Empty<1>, double>(
{7, 1.5}, {Empty<0>{}, 7, Empty<1>{}, 1.5});
test_packed_type_alias<tuple<int, double>, 2, Empty<0>, Empty<1>, int, double>(
{7, 1.5}, {Empty<0>{}, Empty<1>{}, 7, 1.5});
test_packed_type_alias<tuple<int, double, float>, 3, Empty<0>, int, double, float>(
{7, 1.5, 2.5f}, {Empty<0>{}, 7, 1.5, 2.5f});
test_packed_type_alias<tuple<int, double, float>, 3, int, Empty<0>, double, float>(
{7, 1.5, 2.5f}, {7, Empty<0>{}, 1.5, 2.5f});
test_packed_type_alias<tuple<int, double, float>, 3, int, double, Empty<0>, float>(
{7, 1.5, 2.5f}, {7, 1.5, Empty<0>{}, 2.5f});
test_packed_type_alias<tuple<int, double, float>, 3, int, double, float, Empty<0>>(
{7, 1.5, 2.5f}, {7, 1.5, 2.5f, Empty<0>{}});
}
// Compile-time check: does std::tuple_element report
// ExpectedElementType as element number Which of Tuple?
template <class Tuple, size_t Which, class ExpectedElementType>
constexpr bool test_tuple_element() {
return cute::is_same_v<std::tuple_element_t<Which, Tuple>, ExpectedElementType>;
}
// Spot-check test_tuple_element for std::tuple and cute::tuple.
void test_tuple_elements() {
using cute::tuple;
static_assert(test_tuple_element<std::tuple<Empty<0>>, 0, Empty<0>>());
static_assert(test_tuple_element<tuple<Empty<0>>, 0, Empty<0>>());
}
// A default-constructible type.
template <size_t Value>
struct DefaultConstructible {};
// Verify that cute::tuple can be default-constructed at several
// arities; successful compilation of the declarations is the test.
void test_default_constructibility() {
using cute::tuple;
{
[[maybe_unused]] tuple<> t_p_0;
[[maybe_unused]] tuple<DefaultConstructible<0>> t_p_1;
[[maybe_unused]] tuple<DefaultConstructible<0>, DefaultConstructible<1>> t_p_2;
[[maybe_unused]] tuple<DefaultConstructible<0>, int, DefaultConstructible<1>> t_p_3;
}
}
// Verify that cute::tuple spends no storage on empty element types:
// a tuple of (int, Empty, double) has the size of {int, double}, and a
// tuple of only empty (possibly nested) elements is itself empty.
void test_sizes_and_not_storing_empty_types() {
using cute::tuple;
[[maybe_unused]] tuple<
int,
pt_test::Empty<0>,
double
> pt{42, pt_test::Empty<0>{}, 1.5};
static_assert(cute::is_standard_layout_v<decltype(pt)>);
// packed_result_type must only store the packed tuple,
// and not the integer_sequence(s) used to access it.
// The latter can be represented entirely at compile time as types.
struct { int i; double j; } IntDouble;
static_assert(sizeof(pt) == sizeof(IntDouble));
EXPECT_EQ(cute::get<0>(pt), 42);
EXPECT_EQ(cute::get<1>(pt), pt_test::Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), 1.5);
// A tuple of empty types, however deeply nested, is empty too.
tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
tuple<pt_test::Empty<0>, tuple<>>
>
> pt_empty{};
static_assert(cute::is_empty_v<decltype(pt_empty)>);
static_assert(cute::is_standard_layout_v<decltype(pt_empty)>);
static_assert(sizeof(pt_empty) == 1);
// Template arguments must be default constructible,
// and tuple itself needs a default constructor.
[[maybe_unused]] tuple<
tuple<int, pt_test::Empty<2>>,
double,
pt_test::Empty<3>> pt2;
static_assert(cute::is_standard_layout_v<decltype(pt2)>);
// cute::tuple does not promise to have working CTAD
// (constructor template argument deduction).
[[maybe_unused]] tuple<
tuple<int, pt_test::Empty<0>>,
pt_test::Empty<1>
> pt3{
tuple<int, pt_test::Empty<0>>{42, pt_test::Empty<0>{}},
pt_test::Empty<1>{}
};
static_assert(cute::is_standard_layout_v<decltype(pt3)>);
static_assert(cute::is_same_v<
cute::tuple_element_t<0, decltype(pt3)>,
tuple<int, pt_test::Empty<0>>>);
static_assert(cute::is_same_v<
cute::tuple_element_t<1, decltype(pt3)>,
pt_test::Empty<1>>);
static_assert(cute::tuple_size_v<cute::tuple_element_t<0, decltype(pt3)>> == 2u);
// Element access works through the nesting, one level at a time.
tuple<int, pt_test::Empty<0>> pt3_0 = cute::get<0>(pt3);
auto pt3_0_1 = cute::get<1>(pt3_0);
static_assert(cute::is_same_v<decltype(pt3_0_1), pt_test::Empty<0>>);
EXPECT_EQ(cute::get<0>(cute::get<0>(pt3)), 42);
EXPECT_EQ(cute::get<1>(cute::get<0>(pt3)), pt_test::Empty<0>{});
}
} // namespace pt_test
// Driver that runs the run-time portions of the pt_test functions
// defined above (the static_asserts already ran at compile time).
TEST(CuTe_core, PackedTuple2)
{
CUTLASS_TRACE_HOST("-------------------------------");
CUTLASS_TRACE_HOST("tuple");
CUTLASS_TRACE_HOST("-------------------------------");
pt_test::test_packed_type_aliases();
pt_test::test_tuple_elements();
pt_test::test_default_constructibility();
pt_test::test_sizes_and_not_storing_empty_types();
}
// Exercises cute::get on cute::tuple for tuples of size 1, 2, and 3,
// mixing nonempty elements (int, double, Nonempty<float>) with empty
// elements (Empty<N>).  Each tuple shape is tested through three value
// categories: non-const lvalue (read and write), const lvalue (return
// type pinned via decltype), and rvalue/temporary.  tuple_size and
// tuple_element are also checked for each tuple type.
TEST(CuTe_core, PackedTuple2Get) {
  using cute::tuple;
  using pt_test::Empty;
  using pt_test::Nonempty;
  // --- tuple<int>: single nonempty element ---
  {
    using tuple_type = tuple<int>;
    tuple_type pt{42};
    static_assert(cute::tuple_size_v<tuple_type> == 1u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    EXPECT_EQ(cute::get<0>(pt), 42);
    cute::get<0>(pt) = 43;
    EXPECT_EQ(cute::get<0>(pt), 43);
  }
  {
    // Const tuple: get returns a const reference to the stored element.
    using tuple_type = tuple<int>;
    tuple_type const pt{42};
    EXPECT_EQ(cute::get<0>(pt), 42);
    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
  }
  {
    // Rvalue tuple.
    EXPECT_EQ(cute::get<0>(tuple<int>{42}), 42);
  }
  // --- tuple<Empty<0>>: single empty element ---
  {
    using tuple_type = tuple<pt_test::Empty<0>>;
    tuple_type pt;
    static_assert(cute::tuple_size_v<tuple_type> == 1u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, pt_test::Empty<0>>);
    EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
  }
  {
    using tuple_type = tuple<pt_test::Empty<0>>;
    tuple_type const pt;
    EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
  }
  {
    using tuple_type = tuple<pt_test::Empty<0>>;
    EXPECT_EQ(cute::get<0>(tuple_type{}), pt_test::Empty<0>{});
  }
  // --- tuple<int, double>: two nonempty elements ---
  {
    using tuple_type = tuple<int, double>;
    tuple_type pt{1, 2.5};
    static_assert(cute::tuple_size_v<tuple_type> == 2u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
    EXPECT_EQ(cute::get<0>(pt), 1);
    cute::get<0>(pt) = 2;
    EXPECT_EQ(cute::get<0>(pt), 2);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    cute::get<1>(pt) = 3.5;
    EXPECT_EQ(cute::get<1>(pt), 3.5);
  }
  {
    using tuple_type = tuple<int, double>;
    tuple_type const pt{1, 2.5};
    EXPECT_EQ(cute::get<0>(pt), 1);
    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
  }
  {
    using tuple_type = tuple<int, double>;
    EXPECT_EQ(cute::get<0>(tuple_type{1, 2.5}), 1);
    EXPECT_EQ(cute::get<1>(tuple_type{1, 2.5}), 2.5);
  }
  // --- tuple<Empty<0>, double>: empty first element, nonempty second ---
  {
    using tuple_type = tuple<Empty<0>, double>;
    tuple_type pt{Empty<0>{}, 2.5};
    static_assert(cute::tuple_size_v<tuple_type> == 2u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, Empty<0>>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
    EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    cute::get<1>(pt) = 3.5;
    EXPECT_EQ(cute::get<1>(pt), 3.5);
  }
  {
    using tuple_type = tuple<Empty<0>, double>;
    tuple_type const pt{Empty<0>{}, 2.5};
    EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
    // Empty elements come back BY VALUE even from a const tuple
    // (no reference in the decltype), while nonempty elements come
    // back by const reference.
    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), Empty<0>>);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
  }
  {
    using tuple_type = tuple<Empty<0>, double>;
    EXPECT_EQ(cute::get<0>(tuple_type{Empty<0>{}, 2.5}), Empty<0>{});
    EXPECT_EQ(cute::get<1>(tuple_type{Empty<0>{}, 2.5}), 2.5);
  }
  // --- tuple<int, double, Nonempty<float>>: three nonempty elements ---
  {
    using tuple_type = tuple<int, double, Nonempty<float>>;
    tuple_type pt{1, 2.5, Nonempty{3.25f}};
    static_assert(cute::tuple_size_v<tuple_type> == 3u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
    static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
    cute::get<0>(pt) = 42;
    EXPECT_EQ(cute::get<0>(pt), 42);
    cute::get<1>(pt) = 4.5;
    EXPECT_EQ(cute::get<1>(pt), 4.5);
    cute::get<2>(pt) = Nonempty<float>{3.75f};
    EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
  }
  {
    using tuple_type = tuple<int, double, Nonempty<float>>;
    tuple_type const pt{1, 2.5, Nonempty{3.25f}};
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
  }
  {
    // Extra parentheses keep the commas inside EXPECT_EQ's single argument.
    using tuple_type = tuple<int, double, Nonempty<float>>;
    EXPECT_EQ((cute::get<0>(tuple_type{1, 2.5, Nonempty{3.25f}})), 1);
    EXPECT_EQ((cute::get<1>(tuple_type{1, 2.5, Nonempty{3.25f}})), 2.5);
    EXPECT_EQ((cute::get<2>(tuple_type{1, 2.5, Nonempty{3.25f}})), Nonempty{3.25f});
  }
  // --- tuple<int, Empty<0>, Nonempty<float>>: empty element in the middle ---
  {
    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
    tuple<int, Empty<0>, Nonempty<float>> pt{1, Empty<0>{}, Nonempty{3.25f}};
    static_assert(cute::tuple_size_v<tuple_type> == 3u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, Empty<0>>);
    static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
    // Only the nonempty elements (0 and 2) are written; the empty
    // element has no state to mutate.
    cute::get<0>(pt) = 42;
    EXPECT_EQ(cute::get<0>(pt), 42);
    cute::get<2>(pt) = Nonempty<float>{3.75f};
    EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
  }
  {
    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
    tuple_type const pt{1, Empty<0>{}, Nonempty{3.25f}};
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
  }
  {
    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
    EXPECT_EQ((cute::get<0>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), 1);
    EXPECT_EQ((cute::get<1>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Empty<0>{});
    EXPECT_EQ((cute::get<2>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Nonempty{3.25f});
  }
}
namespace pt_test {

// Stateless class template that is implicitly constructible from
// Empty<Value>.  Used by the converting-construction tests to verify
// that tuple can build an element from a *convertible* type rather
// than only from the element type itself.  Instances compare equal
// exactly when their Value template arguments match.
template <int Value>
struct ConvertibleFromEmpty {
  constexpr ConvertibleFromEmpty() = default;
  constexpr ConvertibleFromEmpty(Empty<Value>) {}

  template <int OtherValue>
  friend constexpr bool
  operator==(ConvertibleFromEmpty<Value> const&, ConvertibleFromEmpty<OtherValue> const&) {
    return Value == OtherValue;
  }

  template <int OtherValue>
  friend constexpr bool
  operator!=(ConvertibleFromEmpty<Value> const&, ConvertibleFromEmpty<OtherValue> const&) {
    // Directly the negation of operator== above.
    return Value != OtherValue;
  }
};

} // end namespace pt_test
TEST(CuTe_core, PackedTupleConstexprDefaultConstruction) {
  // Verify that the storage helper behind cute::tuple (ESO_t) is
  // default-constructible in a constexpr context.  MSVC makes this a
  // bit more challenging than usual, so merely compiling these
  // declarations is the point of the test; nothing is executed.
  using pt_test::Empty;

  // One-element storage: empty and nonempty element types.
  [[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>> single_empty{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<int64_t> single_value{};

  // Two-element storage: all four empty/nonempty slot combinations.
  [[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, Empty<1>> both_empty{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, Empty<1>> value_then_empty{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, int64_t> empty_then_value{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, int64_t> both_values{};
}
// Checks that cute::tuple's constructors accept arguments that are
// merely *convertible* to the element types: int -> Nonempty<int>,
// Empty<0> -> ConvertibleFromEmpty<0>, and nested-tuple conversions.
//
// NOTE(review): each case appears twice, once spelling cute::tuple
// qualified and once unqualified.  Given `using cute::tuple;` above,
// both name the same type today -- presumably a leftover from when
// cute::tuple and cute::packed_tuple were distinct types; confirm
// before deduplicating.
TEST(CuTe_core, PackedTupleConvertingConstruction) {
  using cute::tuple;
  using pt_test::ConvertibleFromEmpty;
  using pt_test::Empty;
  using pt_test::Nonempty;
  // int converts to Nonempty<int>.
  {
    using tuple_type = cute::tuple<Nonempty<int>>;
    [[maybe_unused]] tuple_type t(7);
    EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<Nonempty<int>>;
    [[maybe_unused]] tuple_type t(7);
    EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
  }
  // Empty<0> converts to ConvertibleFromEmpty<0>.
  {
    using tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
    [[maybe_unused]] tuple_type t(Empty<0>{});
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
  }
  {
    using tuple_type = tuple<ConvertibleFromEmpty<0>>;
    [[maybe_unused]] tuple_type t(Empty<0>{});
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
  }
  // Mixed: exact-match first element, converting second element.
  {
    using tuple_type = cute::tuple<float, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(1.5f, 7);
    EXPECT_EQ(cute::get<0>(t), 1.5f);
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<float, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(1.5f, 7);
    EXPECT_EQ(cute::get<0>(t), 1.5f);
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  // Empty first element alongside a converting second element.
  {
    using tuple_type = cute::tuple<Empty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), Empty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<Empty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), Empty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  // Both elements constructed via conversion.
  {
    using tuple_type = cute::tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  // Nested tuples: outer tuple constructed from an inner tuple value.
  {
    using inner_tuple_type = cute::tuple<Empty<0>>;
    using outer_tuple_type = cute::tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
  {
    using inner_tuple_type = tuple<Empty<0>>;
    using outer_tuple_type = tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
  // Nested tuples whose inner element itself requires a conversion.
  {
    using inner_tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
    using outer_tuple_type = cute::tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
  {
    using inner_tuple_type = tuple<ConvertibleFromEmpty<0>>;
    using outer_tuple_type = tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
}
namespace test {

// Calls cute::find<X> on t and checks at compile time that the
// resulting integral constant equals ExpectedIndex.
template <size_t ExpectedIndex, class X, class Tuple>
void test_tuple_find(Tuple const& t) {
  auto index = cute::find<X>(t);
  static_assert(decltype(index)::value == ExpectedIndex);
}

// Exercises cute::find over tuples of size 1-3 built from the given
// Tuple template, mixing static integers (cute::_1/_2/_4) with
// dynamic element types (int, float, double).  The cases are grouped
// by the index at which the sought type sits.
template <template <class...> class Tuple>
void test_tuple_find_all() {
  using test::test_tuple_find;
  using cute::_1;
  using cute::_2;
  using cute::_4;
  test_tuple_find<0, _1>(Tuple<_1>{});
  // NOTE(review): _2 does not occur in Tuple<_1>, and the expected
  // index (1) equals the tuple's size -- cute::find presumably
  // returns tuple_size for a type that is absent; confirm against
  // cute::find's definition.  This is the only not-found case here.
  test_tuple_find<1, _2>(Tuple<_1>{});
  test_tuple_find<0, int>(Tuple<int>{7});
  test_tuple_find<0, _1>(Tuple<_1, _2>{});
  test_tuple_find<0, _1>(Tuple<_1, int>{_1{}, 7});
  test_tuple_find<0, float>(Tuple<float, int>{15.5f, 7});
  test_tuple_find<1, _2>(Tuple<_1, _2>{});
  test_tuple_find<1, int>(Tuple<_1, int>{_1{}, 7});
  test_tuple_find<1, int>(Tuple<float, int>{15.5f, 7});
  // Size-3 tuples, sought type at index 0.
  test_tuple_find<0, _1>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
  test_tuple_find<0, _1>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
  test_tuple_find<0, _1>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
  test_tuple_find<0, _1>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
  test_tuple_find<0, double>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
  test_tuple_find<0, double>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
  test_tuple_find<0, double>(Tuple<double, float, int>{105.5, 15.5f, 7});
  // Size-3 tuples, sought type at index 1.
  test_tuple_find<1, _2>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
  test_tuple_find<1, _2>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
  test_tuple_find<1, float>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
  test_tuple_find<1, float>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
  test_tuple_find<1, _2>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
  test_tuple_find<1, float>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
  test_tuple_find<1, float>(Tuple<double, float, int>{105.5, 15.5f, 7});
  // Size-3 tuples, sought type at index 2.
  test_tuple_find<2, _4>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
  test_tuple_find<2, int>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
  test_tuple_find<2, _4>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
  test_tuple_find<2, int>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
  test_tuple_find<2, _4>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
  test_tuple_find<2, _4>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
  test_tuple_find<2, int>(Tuple<double, float, int>{105.5, 15.5f, 7});
}

} // end namespace test
// Runs the full cute::find test matrix against cute::tuple.
TEST(CuTe_core, TupleFind)
{
  test::test_tuple_find_all<cute::tuple>();
}

View File

@ -1,103 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cute/container/packed_tuple.hpp>
#include <cute/container/tuple.hpp>
namespace test {
template<size_t ExpectedIndex, class X, class Tuple>
void test_tuple_find(Tuple const& t) {
auto index = cute::find<X>(t);
static_assert(decltype(index)::value == ExpectedIndex);
}
template<template<class...> class Tuple>
void test_tuple_find_all() {
using test::test_tuple_find;
using cute::_1;
using cute::_2;
using cute::_4;
test_tuple_find<0, _1>(Tuple<_1>{});
test_tuple_find<0, int>(Tuple<int>{7});
test_tuple_find<0, _1>(Tuple<_1, _2>{});
test_tuple_find<0, _1>(Tuple<_1, int>{_1{}, 7});
test_tuple_find<0, float>(Tuple<float, int>{15.5f, 7});
test_tuple_find<1, _2>(Tuple<_1, _2>{});
test_tuple_find<1, int>(Tuple<_1, int>{_1{}, 7});
test_tuple_find<1, int>(Tuple<float, int>{15.5f, 7});
test_tuple_find<0, _1>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
test_tuple_find<0, _1>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
test_tuple_find<0, _1>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
test_tuple_find<0, _1>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
test_tuple_find<0, double>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
test_tuple_find<0, double>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
test_tuple_find<0, double>(Tuple<double, float, int>{105.5, 15.5f, 7});
test_tuple_find<1, _2>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
test_tuple_find<1, _2>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
test_tuple_find<1, float>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
test_tuple_find<1, float>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
test_tuple_find<1, _2>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
test_tuple_find<1, float>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
test_tuple_find<1, float>(Tuple<double, float, int>{105.5, 15.5f, 7});
test_tuple_find<2, _4>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
test_tuple_find<2, int>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
test_tuple_find<2, _4>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
test_tuple_find<2, int>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
test_tuple_find<2, _4>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
test_tuple_find<2, _4>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
test_tuple_find<2, int>(Tuple<double, float, int>{105.5, 15.5f, 7});
}
} // end namespace test
TEST(CuTe_core, TupleFind)
{
test::test_tuple_find_all<cute::tuple>();
}
// If cute::tuple is not simply an alias for cute::packed_tuple,
// then test cute::packed_tuple separately.
#if ! defined(CUTLASS_USE_PACKED_TUPLE)
TEST(CuTe_core, PackedTupleFind)
{
test::test_tuple_find_all<cute::packed_tuple>();
}
#endif // CUTLASS_USE_PACKED_TUPLE

View File

@ -53,8 +53,6 @@ private:
template<class Integral, Integral Value>
using IC = std::integral_constant<Integral, Value>;
#if ! defined(CUTLASS_USE_PACKED_TUPLE)
TEST(CuTe_core_msvc_compilation, TupleAssignment)
{
CUTLASS_TRACE_HOST("-------------------------------");
@ -64,30 +62,10 @@ TEST(CuTe_core_msvc_compilation, TupleAssignment)
using forty_two_type = IC<int, 42>;
using forty_three_type = IC<size_t, 43>;
using ebo_s_type = cute::detail::EBO<0, forty_two_type>;
[[maybe_unused]] ebo_s_type ebo_s;
static_assert(std::is_same_v<decltype(cute::detail::getv(ebo_s)), forty_two_type>);
using ebo_d_type = cute::detail::EBO<1, size_t>;
[[maybe_unused]] ebo_d_type ebo_d(43u);
assert(ebo_d.t_ == 43u);
static_assert(std::is_same_v<std::remove_const_t<std::remove_reference_t<decltype(cute::detail::getv(ebo_d))>>, size_t > );
assert(cute::detail::getv(ebo_d) == 43u);
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0, 1, 2>, int, forty_two_type, size_t> tb0{
41, forty_two_type{}, size_t(43u) };
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0, 1, 2>, int, forty_two_type, size_t> tb1;
int val41 = ConvertibleTo{41};
assert(val41 == 41);
size_t val43 = ConvertibleTo{size_t(43u)};
assert(val43 == size_t{43u});
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0, 1, 2>, int, forty_two_type, size_t> tb2{
ConvertibleTo{41}, forty_two_type{}, ConvertibleTo{size_t(43u)}};
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0>, int> tb3{ 41 };
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0>, int> tb3a{ 42 };
tb3 = tb3a;
using tuple_0d_type = cute::tuple<>;
using tuple_1d_d_type = cute::tuple<int>;
@ -106,7 +84,6 @@ TEST(CuTe_core_msvc_compilation, TupleAssignment)
// 'TupleBase<int, unsigned __int64>' is not a base or member"
t3 = t3a;
}
#endif // CUTLASS_USE_PACKED_TUPLE
TEST(CuTe_core_msvc_compilation, TupleGetSingleInteger)
{

View File

@ -51,12 +51,8 @@ endfunction()
################################################################################
add_subdirectory(sm100_blockscaled_tensorop_gemm)
add_subdirectory(sm100_tensorop_gemm)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_simt
@ -833,7 +829,7 @@ endif()
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_tensorop_sm100_ptr_array

View File

@ -2204,7 +2204,7 @@ bool TestSmall(double alpha = 1.0, double beta = 1.0,
static constexpr bool IsF8F6F4 = cutlass::gemm::collective::detail::is_sm100_mma_f8f6f4<TiledMma, ElementA, ElementB>();
alignment_bits = cutlass::detail::get_input_alignment_bits<ElementA, IsF8F6F4>();
// For fp4 and fp6 QMMA kernels, the min alignment_input is 128 elements, so we don't need to add alignment_input in test problem sizes.
// For fp4 and fp6 kernels, the min alignment_input is 128 elements, so we don't need to add alignment_input in test problem sizes.
int alignment_input = (alignment_bits / cute::sizeof_bits<ElementA>::value == 128) ? 0 : (alignment_bits / cute::sizeof_bits<ElementA>::value);

View File

@ -30,7 +30,7 @@
#
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
add_custom_target(
cutlass_test_unit_gemm_device_sm100_blockscaled
DEPENDS
@ -57,7 +57,7 @@ cutlass_test_unit_gemm_device_add_executable(
nvf4_nvf4_f16_nvfp4_epilogue.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
BATCH_SOURCES ON
@ -67,7 +67,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf4_mxf4_void_f16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
BATCH_SOURCES ON
@ -77,7 +77,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf6_mxf6_void_bf16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
BATCH_SOURCES ON
@ -87,7 +87,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf8_mxf8_void_f8_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
BATCH_SOURCES ON
@ -97,7 +97,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf6_mxf8_void_f32_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
BATCH_SOURCES ON
@ -107,7 +107,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf8_mxf6_f16_f8_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
BATCH_SOURCES ON
@ -117,7 +117,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf4_mxf8_bf16_bf16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
BATCH_SOURCES ON
@ -127,7 +127,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf8_mxf4_f16_bf16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
BATCH_SOURCES ON
@ -137,7 +137,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf6_mxf4_f16_f16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
BATCH_SOURCES ON

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -181,8 +179,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -190,12 +186,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -254,8 +250,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -263,12 +257,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -109,8 +109,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -118,7 +116,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -337,12 +329,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -410,7 +400,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,7 +258,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,12 +259,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,7 +330,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,12 +472,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,12 +259,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,7 +330,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,12 +472,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,7 +258,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,12 +259,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,7 +401,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,12 +472,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -337,7 +329,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -410,12 +400,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -337,7 +329,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -410,12 +400,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x1
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x2
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x2
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x1
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x1
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x2
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x2
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -107,8 +107,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -116,7 +114,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -181,8 +179,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -190,12 +186,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -337,7 +329,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -410,12 +400,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -107,8 +107,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -116,7 +114,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -180,8 +178,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -189,7 +185,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -259,8 +255,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -268,7 +262,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -334,8 +328,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -343,7 +335,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -413,8 +405,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -422,12 +412,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -486,8 +476,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -495,12 +483,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -561,8 +549,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -570,12 +556,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -634,8 +620,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -643,12 +627,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -92,8 +92,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -101,12 +99,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -165,8 +163,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -174,12 +170,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -244,8 +240,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -256,7 +250,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -321,8 +315,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -333,7 +325,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -104,8 +104,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs16_bstens
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
@ -130,12 +128,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs16_bstens
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -195,8 +193,6 @@ TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstens
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
@ -220,12 +216,12 @@ TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstens
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -289,8 +285,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs32_bstens
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct FusionOperation
@ -310,7 +304,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs32_bstens
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -382,8 +376,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1n_outputVs16_bstens
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
@ -399,12 +391,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1n_outputVs16_bstens
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
PerSmTileShape_MNK, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, AlignC,
ElementD, GmemLayoutC, AlignD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
cutlass::epilogue::TmaWarpSpecialized1Sm,
FusionOperation
>::CollectiveOp;

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_4,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_128,_64>;
using ClusterShape = Shape<_4,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,17 +254,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -286,7 +274,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -324,16 +312,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_4,_4,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,17 +254,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -286,7 +274,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -324,16 +312,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,16 +254,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -321,17 +309,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -344,7 +329,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -382,17 +367,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -405,7 +387,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -443,17 +425,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -466,7 +445,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -504,17 +483,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -527,7 +503,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -565,16 +541,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -586,7 +559,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -79,17 +79,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -102,7 +99,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -139,17 +136,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -162,7 +156,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -200,17 +194,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -223,7 +214,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -261,17 +252,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -322,16 +310,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -343,7 +328,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -380,17 +365,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -403,7 +385,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -441,17 +423,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -464,7 +443,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -502,17 +481,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -525,7 +501,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -563,17 +539,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -586,7 +559,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -624,16 +597,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -645,7 +615,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -79,17 +79,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -102,7 +99,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -139,17 +136,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -162,7 +156,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -200,17 +194,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -223,7 +214,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -261,17 +252,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -322,16 +310,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -343,7 +328,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -380,17 +365,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -403,7 +385,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -441,17 +423,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -464,7 +443,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -502,17 +481,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -525,7 +501,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -563,17 +539,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -586,7 +559,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -624,16 +597,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -645,7 +615,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,16 +254,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -322,17 +310,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -383,17 +368,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -406,7 +388,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -444,17 +426,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -467,7 +446,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -505,17 +484,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -528,7 +504,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -566,16 +542,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -587,7 +560,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,17 +254,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -286,7 +274,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -324,16 +312,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -383,17 +368,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -406,7 +388,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -444,17 +426,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -467,7 +446,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -505,17 +484,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -528,7 +504,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -566,17 +542,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -589,7 +562,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -627,16 +600,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -648,7 +618,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -73,17 +73,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 512x256x256_4x2x
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using MmaTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -126,17 +123,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x384x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using MmaTileShape = cute::Shape<_128,_192,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
@ -179,17 +173,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using MmaTileShape = cute::Shape<_128,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
@ -232,17 +223,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 256x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
@ -285,17 +273,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_256,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,

View File

@ -73,17 +73,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 512x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using MmaTileShape = Shape<_128,_128,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -126,17 +123,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x384x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using MmaTileShape = cute::Shape<_128,_192,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -179,17 +173,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using MmaTileShape = cute::Shape<_128,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -232,17 +223,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -285,17 +273,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_256,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,

View File

@ -73,17 +73,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 128x128x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_128,_128,_256>;
using MmaTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -126,17 +123,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using MmaTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -179,17 +173,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_128,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -232,17 +223,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 512x1024x256
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using MmaTileShape = cute::Shape<_128,_256,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -285,17 +273,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -338,17 +323,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_512,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -391,17 +373,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_256,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -444,17 +423,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x1024x256
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using MmaTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,

View File

@ -69,16 +69,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -118,16 +115,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 256x128x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -167,16 +161,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -216,16 +207,13 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_group, 128x128x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -265,16 +253,13 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -314,16 +299,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_group, 256x128x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -363,16 +345,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -412,16 +391,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -461,16 +437,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
void, LayoutD *, 16 / sizeof(ElementD),

View File

@ -74,16 +74,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -124,16 +121,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -174,16 +168,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -224,16 +215,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 512x512x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -274,16 +262,13 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -324,16 +309,13 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -374,16 +356,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -424,16 +403,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_ptr_array, 512x512x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),

View File

@ -71,17 +71,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 128x64x128_1x1x1
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -126,18 +122,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 512x128x128_4x2x
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_512,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_2,_1>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -183,17 +174,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s32n_tensorop_1cta_s32_ptr_array, 64x256x128_1x1x
using ElementAccumulator = int32_t;
using ElementCompute = int32_t;
using ElementBias = int32_t;
using ClusterTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -239,17 +226,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_128,_1024,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = Shape<_128,_256,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),

View File

@ -74,10 +74,8 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -85,7 +83,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -132,10 +130,8 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -143,7 +139,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -190,10 +186,8 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -201,7 +195,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -248,10 +242,8 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -259,7 +251,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,

View File

@ -71,10 +71,8 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -82,7 +80,7 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
@ -126,10 +124,8 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -137,7 +133,7 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
@ -181,10 +177,8 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -192,7 +186,7 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
@ -236,10 +230,8 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -247,7 +239,7 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,

View File

@ -74,10 +74,8 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -85,7 +83,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -132,10 +130,8 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -143,7 +139,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -190,10 +186,8 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -201,7 +195,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -248,10 +242,8 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -259,7 +251,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,

View File

@ -29,7 +29,7 @@
#
#
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
add_custom_target(
cutlass_test_unit_gemm_device_sm100_tensorop
DEPENDS
@ -38,7 +38,7 @@ add_custom_target(
cutlass_test_unit_gemm_device_tensorop_sm100_s8xs8
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f16xf16
BATCH_SOURCES ON
@ -48,7 +48,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f16_f16_f16_f16_fusion.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f8xf8
BATCH_SOURCES ON
@ -58,7 +58,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f8_f8_f16_f8_fusion.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_s8xs8
BATCH_SOURCES ON
@ -67,5 +67,6 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
s8_s8_void_s32.cu
s8_s8_s32_s32_fusion.cu
)
endif()
add_subdirectory(narrow_precision)

View File

@ -88,8 +88,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -108,7 +106,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -173,8 +171,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -193,7 +189,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -264,8 +260,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -290,7 +284,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -355,8 +349,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -380,7 +372,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -451,8 +443,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// dY = alpha * acc + beta * C
@ -476,7 +466,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -541,8 +531,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// dY = alpha * acc + beta * C
@ -566,7 +554,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 64x64x64_4x1x1_1sm
using MmaTileShape_MNK = Shape<_64,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_64>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -94,7 +92,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 64x64x64_4x1x1_1sm
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -159,8 +157,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32t_tensor_op_f32, 64x128x64_1x4x1_1s
using MmaTileShape_MNK = Shape<_64,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_64>;
//
// Construct CollectiveEpilogue
@ -168,7 +164,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32t_tensor_op_f32, 64x128x64_1x4x1_1s
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -232,8 +228,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32t_tensor_op_f32, 128x64x64_1x8x1_st
using MmaTileShape_MNK = Shape<_128,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_64>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -244,7 +238,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32t_tensor_op_f32, 128x64x64_1x8x1_st
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -309,8 +303,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 128x128x64_2x8x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
//
// Construct CollectiveEpilogue
@ -318,7 +310,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 128x128x64_2x8x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -383,8 +375,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 128x64x64_2x4x1_2s
using MmaTileShape_MNK = Shape<_128,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_64>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -395,7 +385,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 128x64x64_2x4x1_2s
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -461,8 +451,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32n_tensor_op_f32, 128x128x64_16x1x1_
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_16,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_64>;
//
// Construct CollectiveEpilogue
@ -470,7 +458,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32n_tensor_op_f32, 128x128x64_16x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -534,8 +522,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32n_tensor_op_f32, 256x64x64_4x1x1) {
using MmaTileShape_MNK = Shape<_256,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_64>;
//
// Construct CollectiveEpilogue
@ -543,7 +529,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32n_tensor_op_f32, 256x64x64_4x1x1) {
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -607,8 +593,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 256x256x64_2x1x1)
using MmaTileShape_MNK = Shape<_256,_256,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_64>;
//
// Construct CollectiveEpilogue
@ -616,7 +600,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 256x256x64_2x1x1)
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -88,8 +88,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -108,7 +106,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -173,8 +171,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -194,7 +190,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -265,8 +261,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -294,7 +288,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -359,8 +353,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -388,7 +380,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -91,7 +89,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -155,8 +153,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e5m2n_void_f32t_tensor_op_f32, 64x128x128_1x4x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -167,7 +163,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e5m2n_void_f32t_tensor_op_f32, 64x128x128_1x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -232,8 +228,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3n_void_f32t_tensor_op_f32, 128x64x128_1x8x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -241,7 +235,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3n_void_f32t_tensor_op_f32, 128x64x128_1x8x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -305,8 +299,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x8x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -317,7 +309,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x8x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -383,8 +375,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_2x4x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -392,7 +382,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -457,8 +447,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_void_f32n_tensor_op_f32, 128x128x128_16x1
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_16,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -469,7 +457,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_void_f32n_tensor_op_f32, 128x128x128_16x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -534,8 +522,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -543,7 +529,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -607,8 +593,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x1x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -619,7 +603,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -29,7 +29,7 @@
#
#
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
add_custom_target(
cutlass_test_unit_gemm_device_sm100_tensorop_narrow_precision
DEPENDS
@ -38,7 +38,7 @@ add_custom_target(
cutlass_test_unit_gemm_device_tensorop_sm100_f8xf6f4
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f6f4xf6f4
BATCH_SOURCES ON
@ -50,7 +50,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f6f4_f6f4_void_f32_tt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f6f4xf8
BATCH_SOURCES ON
@ -60,7 +60,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f6f4_f8_void_f32_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f8xf6f4
BATCH_SOURCES ON
@ -69,3 +69,4 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f8_f6f4_void_f32_tn_layout.cu
f8_f6f4_void_f32_nt_layout.cu
)
endif()

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -124,7 +122,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -189,8 +187,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -198,7 +194,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -274,7 +268,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -339,8 +333,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -348,7 +340,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -412,8 +404,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e3m2n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -424,7 +414,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e3m2n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -489,8 +479,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -498,7 +486,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -562,8 +550,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -574,7 +560,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -639,8 +625,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -648,7 +632,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -121,7 +119,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -185,8 +183,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -271,7 +265,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -121,7 +119,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -185,8 +183,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using MmaTileShape_MNK = Shape<_64,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -271,7 +265,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -335,8 +329,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -344,7 +336,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -408,8 +400,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -420,7 +410,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -485,8 +475,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -494,7 +482,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -558,8 +546,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -570,7 +556,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -635,8 +621,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -644,7 +628,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -708,8 +692,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -720,7 +702,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -785,8 +767,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -794,7 +774,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -858,8 +838,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -870,7 +848,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -935,8 +913,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -944,7 +920,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1008,8 +984,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1020,7 +994,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1085,8 +1059,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e3m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -1094,7 +1066,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e3m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1158,8 +1130,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1170,7 +1140,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1235,8 +1205,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -1244,7 +1212,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -111,8 +111,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -123,7 +121,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -188,8 +186,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -261,8 +257,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -273,7 +267,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -338,8 +332,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -411,8 +403,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -423,7 +413,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -488,8 +478,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -111,8 +111,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -123,7 +121,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -188,8 +186,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -261,8 +257,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -273,7 +267,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -338,8 +332,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e5m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e5m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -411,8 +403,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -423,7 +413,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -488,8 +478,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e5m2t_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e5m2t_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -561,8 +549,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -573,7 +559,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -638,8 +624,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -647,7 +631,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -121,7 +119,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -185,8 +183,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -263,8 +259,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using MmaTileShape_MNK = Shape<_64,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -272,7 +266,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -336,8 +330,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -345,7 +337,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -409,8 +401,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -421,7 +411,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -486,8 +476,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -495,7 +483,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -559,8 +547,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -571,7 +557,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -636,8 +622,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -645,7 +629,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -709,8 +693,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -721,7 +703,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -786,8 +768,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -795,7 +775,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -859,8 +839,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -868,7 +846,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -932,8 +910,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -941,7 +917,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1005,8 +981,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1017,7 +991,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1082,8 +1056,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -1091,7 +1063,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1155,8 +1127,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1167,7 +1137,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1232,8 +1202,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -1241,7 +1209,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -111,8 +111,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m3t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -123,7 +121,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m3t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -188,8 +186,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e3m2t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e3m2t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -261,8 +257,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -273,7 +267,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -338,8 +332,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m3t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m3t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -411,8 +403,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -423,7 +413,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -488,8 +478,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -124,7 +122,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -189,8 +187,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -198,7 +194,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using MmaTileShape_MNK = Shape<_64,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -271,7 +265,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -335,8 +329,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -412,8 +404,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -421,7 +411,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -485,8 +475,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -562,8 +550,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -571,7 +557,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -635,8 +621,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -647,7 +631,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -712,8 +696,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -721,7 +703,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -785,8 +767,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -797,7 +777,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -862,8 +842,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -871,7 +849,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -935,8 +913,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -947,7 +923,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1012,8 +988,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -1021,7 +995,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1085,8 +1059,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -1094,7 +1066,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1158,8 +1130,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -1167,7 +1137,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1231,8 +1201,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1243,7 +1211,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Epilogue fusion operation
// Z = per-row alpha * acc + per-row beta * C + per-row bias
@ -101,7 +99,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -166,8 +164,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Epilogue fusion operation
// Z = per-col alpha * acc + per-col beta * C + per-col bias
@ -185,7 +181,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 64x64x128_4x1x1_1sm_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -94,7 +92,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 64x64x128_4x1x1_1sm_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -159,8 +157,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32t_tensor_op_f32, 64x128x128_1x4x1_1sm
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -168,7 +164,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32t_tensor_op_f32, 64x128x128_1x4x1_1sm
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -232,8 +228,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32t_tensor_op_f32, 128x64x128_1x8x1_str
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -244,7 +238,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32t_tensor_op_f32, 128x64x128_1x8x1_str
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -309,8 +303,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 128x128x128_2x8x1_1s
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -318,7 +310,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 128x128x128_2x8x1_1s
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -383,8 +375,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 128x64x128_2x4x1_2sm
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -395,7 +385,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 128x64x128_2x4x1_2sm
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -461,8 +451,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32n_tensor_op_f32, 128x128x128_16x1x1_2
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_16,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -470,7 +458,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32n_tensor_op_f32, 128x128x128_16x1x1_2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -534,8 +522,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32n_tensor_op_f32, 256x64x128_4x1x1_str
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -546,7 +532,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32n_tensor_op_f32, 256x64x128_4x1x1_str
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -611,8 +597,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 256x256x128_2x1x1) {
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -620,7 +604,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 256x256x128_2x1x1) {
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -26,12 +26,21 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_test_unit_add_executable(
cutlass_test_unit_pipeline
set(PIPELINE_SOURCES
pipeline_tma_async.cu
pipeline_tma_async_warp_specialized.cu
pipeline_tma_async_warp_specialized_persistent.cu
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
pipeline_async.cu
sequence_barrier.cu
)
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
list(APPEND PIPELINE_SOURCES
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
)
endif()
cutlass_test_unit_add_executable(
cutlass_test_unit_pipeline
${PIPELINE_SOURCES}
)