Collection of changes to fix clang build. (#1200)
* Remove unused variables
* Qualify calls to make_fragment_? from templated base class. Fixes clang build error.
* Add missing `#include <cstdio>`
* Various changes to fix clang compile errors.
* More changes to fix clang build. Remaining issues:
  - `params` initializer of `CollectiveEpilogue`.
  - `ops` initializer of `Sm90VisitorImplBase`.
  - `__usAtomicCAS` needs to be added to clang upstream.
* Fix remaining clang build issues.
* Qualify `cute::rank()` calls.
* Qualify some more calls that are otherwise ambiguous between `cute` and `std` namespace.
* Double-escape special registers in inline asm.
* small change

---------

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
This commit is contained in:
@ -193,8 +193,8 @@ struct TestbedImpl {
|
||||
|
||||
using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90::RasterOrderOptions;
|
||||
|
||||
static_assert(rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
|
||||
static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
|
||||
static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
|
||||
static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
|
||||
|
||||
static constexpr uint32_t mma_promotion_interval = 4;
|
||||
|
||||
@ -523,9 +523,6 @@ struct TestbedImpl {
|
||||
Gemm& gemm_op,
|
||||
typename Gemm::Arguments& arguments,
|
||||
cutlass::device_memory::allocation<uint8_t>& workspace) {
|
||||
int M = cute::size<0>(problem_size);
|
||||
int N = cute::size<1>(problem_size);
|
||||
int K = cute::size<2>(problem_size);
|
||||
int L = 1;
|
||||
if constexpr(cute::rank(ProblemShapeType{}) == 4) {
|
||||
L = cute::size<3>(problem_size);
|
||||
@ -581,7 +578,7 @@ struct TestbedImpl {
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = 0;
|
||||
if (not profiling) {
|
||||
this->sm_count = min(MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
this->sm_count = std::min(MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
hw_info.sm_count = this->sm_count;
|
||||
}
|
||||
else {
|
||||
@ -1240,7 +1237,7 @@ struct Testbed3xFusionOperation {
|
||||
|
||||
hw_info.device_id = 0;
|
||||
if (not profiling) {
|
||||
impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
hw_info.sm_count = impl_.sm_count;
|
||||
}
|
||||
else {
|
||||
|
||||
@ -173,7 +173,7 @@ public:
|
||||
HostScalarBroadcast(){}
|
||||
template<typename ProblemShapeType, typename TestBedImpl>
|
||||
HostScalarBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
|
||||
:_scalar(ElementCompute(Value)), Base(check_relative_equality) {}
|
||||
: Base(check_relative_equality), _scalar(ElementCompute(Value)) {}
|
||||
|
||||
template <class ElementAccumulator>
|
||||
ElementCompute visit(
|
||||
@ -232,7 +232,7 @@ public:
|
||||
HostRowBroadcast(){}
|
||||
template<typename ProblemShapeType>
|
||||
HostRowBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
|
||||
:impl_(impl), Base(check_relative_equality) {
|
||||
: Base(check_relative_equality), impl_(impl) {
|
||||
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
|
||||
_N = cute::get<1>(problem_shape_MNKL);
|
||||
_bias.resize(cutlass::Coord<1>(_N));
|
||||
@ -300,7 +300,7 @@ public:
|
||||
HostColBroadcast(){}
|
||||
template<typename ProblemShapeType>
|
||||
HostColBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
|
||||
:impl_(impl), Base(check_relative_equality) {
|
||||
: Base(check_relative_equality), impl_(impl) {
|
||||
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
|
||||
_M = cute::get<0>(problem_shape_MNKL);
|
||||
_bias.resize(cutlass::Coord<1>(_M));
|
||||
@ -382,7 +382,7 @@ public:
|
||||
HostAuxLoad(){}
|
||||
template<typename ProblemShapeType>
|
||||
HostAuxLoad(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
|
||||
:impl_(impl), Base(check_relative_equality){
|
||||
: Base(check_relative_equality), impl_(impl){
|
||||
auto problem_shape_NMKL = cute::append<4>(problem_size, 1);
|
||||
auto [_M, _N, K, _L] = problem_shape_NMKL;
|
||||
auto aux_coord = cutlass::make_Coord(_M * _L, _N);
|
||||
@ -513,8 +513,8 @@ public:
|
||||
HostUnaryCompute(){}
|
||||
template <typename ProblemShapeType, typename TestBedImpl>
|
||||
HostUnaryCompute(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
|
||||
_child_0(problem_size, impl, check_relative_equality),
|
||||
Base(check_relative_equality) { }
|
||||
Base(check_relative_equality),
|
||||
_child_0(problem_size, impl, check_relative_equality) { }
|
||||
|
||||
template <class ElementAccumulator>
|
||||
ElementCompute visit(
|
||||
@ -578,8 +578,8 @@ public:
|
||||
HostAuxStore(){}
|
||||
template <typename ProblemShapeType>
|
||||
HostAuxStore(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
|
||||
impl_(impl),
|
||||
Base(check_relative_equality) {
|
||||
Base(check_relative_equality),
|
||||
impl_(impl) {
|
||||
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
|
||||
auto [_M, _N, K, _L] = problem_shape_MNKL;
|
||||
auto aux_coord = cutlass::make_Coord(_M * _L, _N);
|
||||
@ -677,8 +677,8 @@ public:
|
||||
HostRowReduce(){}
|
||||
template <typename ProblemShapeType>
|
||||
HostRowReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
|
||||
impl_(impl),
|
||||
Base(check_relative_equality) {
|
||||
Base(check_relative_equality),
|
||||
impl_(impl) {
|
||||
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
|
||||
_N = cute::get<1>(problem_shape_MNKL);
|
||||
_tensor_row_reduce.resize(cutlass::Coord<1>(_N));
|
||||
@ -764,8 +764,8 @@ public:
|
||||
HostColumnReduce(){}
|
||||
template <typename ProblemShapeType>
|
||||
HostColumnReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
|
||||
impl_(impl),
|
||||
Base(check_relative_equality) {
|
||||
Base(check_relative_equality),
|
||||
impl_(impl) {
|
||||
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
|
||||
_M = cute::get<0>(problem_shape_MNKL);
|
||||
_tensor_column_reduce.resize(cutlass::Coord<1>(_M));
|
||||
@ -850,9 +850,8 @@ public:
|
||||
HostScalarReduce(){}
|
||||
template <typename ProblemShapeType>
|
||||
HostScalarReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
|
||||
impl_(impl),
|
||||
Base(check_relative_equality) {
|
||||
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
|
||||
Base(check_relative_equality),
|
||||
impl_(impl) {
|
||||
_tensor_scalar_reduce.resize(cutlass::Coord<1>(1));
|
||||
_reference_scalar_reduce.resize(cutlass::Coord<1>(1));
|
||||
_reduce_buffer.resize(cutlass::Coord<1>(1));
|
||||
@ -1229,7 +1228,6 @@ public:
|
||||
auto N = cute::get<1>(problem_shape_MNKL);
|
||||
auto K = cute::get<2>(problem_shape_MNKL);
|
||||
auto L = cute::get<3>(problem_shape_MNKL);
|
||||
auto coord_0 = cutlass::make_Coord(0);
|
||||
|
||||
auto A = cute::make_tensor(impl_.tensor_A.host_data(),
|
||||
cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
|
||||
@ -1307,7 +1305,7 @@ public:
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = 0;
|
||||
if (not profiling) {
|
||||
impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
hw_info.sm_count = impl_.sm_count;
|
||||
}
|
||||
else {
|
||||
|
||||
@ -158,7 +158,6 @@ struct Testbed3xTensorBroadcast {
|
||||
bool use_bias)
|
||||
{
|
||||
auto [M, N, K, L] = problem_shape_MNKL;
|
||||
auto coord_0 = cutlass::make_Coord(0);
|
||||
|
||||
impl_.tensor_D.sync_host();
|
||||
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_A.host_view()), 0);
|
||||
@ -218,7 +217,6 @@ struct Testbed3xTensorBroadcast {
|
||||
auto N = cute::get<1>(problem_shape_MNKL);
|
||||
auto K = cute::get<2>(problem_shape_MNKL);
|
||||
auto L = cute::get<3>(problem_shape_MNKL);
|
||||
auto coord_0 = cutlass::make_Coord(0);
|
||||
|
||||
auto A = cute::make_tensor(impl_.tensor_A.host_data(),
|
||||
cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
|
||||
@ -338,7 +336,7 @@ struct Testbed3xTensorBroadcast {
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = 0;
|
||||
if (not profiling) {
|
||||
impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
|
||||
hw_info.sm_count = impl_.sm_count;
|
||||
}
|
||||
else {
|
||||
|
||||
@ -163,7 +163,7 @@ public:
|
||||
using EVTModule = HEVT<
|
||||
HostAuxStore<Gemm, true>,
|
||||
HEVT<
|
||||
HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>, // activation(Z) * scaled_d
|
||||
HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>, // activation(Z) * scaled_d
|
||||
HEVT<
|
||||
HostCompute<Gemm, ActivationFn>, // activation(Z)
|
||||
HEVT<
|
||||
@ -174,11 +174,11 @@ public:
|
||||
HostCompute<Gemm, cutlass::homogeneous_multiply_add>,
|
||||
HostScalarBroadcast<Gemm, 1, 3>, // scale_a * scale_b * alpha
|
||||
HostAccumulator<Gemm>,
|
||||
HostColBroadcast<Gemm, ElementD>,
|
||||
HostColBroadcast<Gemm, ElementD>
|
||||
>
|
||||
>
|
||||
>,
|
||||
HostScalarBroadcast<Gemm, 1>, // scale_d
|
||||
HostScalarBroadcast<Gemm, 1> // scale_d
|
||||
>
|
||||
>;
|
||||
};
|
||||
@ -211,26 +211,26 @@ public:
|
||||
HostCompute<Gemm, cutlass::homogeneous_multiply_add>,
|
||||
HostScalarBroadcast<Gemm, 1, 3>, // scale_a * scale_b * alpha
|
||||
HostAccumulator<Gemm>,
|
||||
HostColBroadcast<Gemm, ElementD>,
|
||||
HostColBroadcast<Gemm, ElementD>
|
||||
>
|
||||
>,
|
||||
// D = activation(Z) * scaled_d, amax_d = max(abs(elements in D))
|
||||
HEVT<
|
||||
HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,
|
||||
HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,
|
||||
HEVT<
|
||||
HostScalarReduce<Gemm, amax, float>,
|
||||
HEVT<
|
||||
HostCompute<Gemm, ActivationFn>, //activation(Z) * scaled_d
|
||||
HostAccumulator<Gemm>, // Z
|
||||
HostAccumulator<Gemm> // Z
|
||||
>
|
||||
>,
|
||||
HostScalarBroadcast<Gemm, 1>, // scale_d
|
||||
HostScalarBroadcast<Gemm, 1> // scale_d
|
||||
>,
|
||||
// Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux))
|
||||
HEVT<
|
||||
HostAuxStore<Gemm, false, ElementD, cutlass::layout::RowMajor>,
|
||||
HEVT<
|
||||
HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,
|
||||
HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,
|
||||
HEVT<
|
||||
HostScalarReduce<Gemm, amax, float>,
|
||||
HostAccumulator<Gemm>
|
||||
|
||||
Reference in New Issue
Block a user