Collection of changes to fix clang build. (#1200)

* Remove unused variables

* Qualify calls to `make_fragment_?` from the templated base class.

Fixes a clang build error.
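
Clang applies two-phase name lookup strictly: an unqualified call to a member inherited from a dependent base class is not found at template definition time, while nvcc's front end has historically been more permissive. Qualifying the call with `this->` (or with the base class name) fixes it. A minimal sketch of the pattern, using hypothetical names rather than the actual CUTLASS classes:

```cpp
template <class T>
struct Base {
  T make_fragment_A() const { return T{}; }  // stand-in for the make_fragment_? helpers
};

template <class T>
struct Derived : Base<T> {
  T run() const {
    // return make_fragment_A();      // clang: use of undeclared identifier
    return this->make_fragment_A();   // qualified: looked up in the dependent base
  }
};

int main() {
  return Derived<int>{}.run();  // returns 0
}
```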

* Add missing `#include <cstdio>`

* Various changes to fix clang compile errors.

* More changes to fix clang build.

Remaining issues:

- `params` initializer of `CollectiveEpilogue`.
- `ops` initializer of `Sm90VisitorImplBase`.
- `__usAtomicCAS` needs to be added to clang upstream.

* Fix remaining clang build issues.

* Qualify `cute::rank()` calls.

* Qualify some more calls that are otherwise ambiguous between `cute` and `std` namespace.
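
The ambiguity arises when both namespaces are in scope (via using-directives or argument-dependent lookup) and each provides a viable overload; clang then refuses to pick one where nvcc happened to. A contrived, self-contained sketch — `lib` stands in for `cute`, and all names here are hypothetical:

```cpp
#include <algorithm>  // std::min

namespace lib {  // stand-in for cute::
template <class T>
constexpr T const& min(T const& a, T const& b) { return b < a ? b : a; }
}  // namespace lib

using namespace lib;
using namespace std;

int smaller(int a, int b) {
  // return min(a, b);    // error: ambiguous between lib::min and std::min
  return std::min(a, b);  // qualifying the call resolves the ambiguity
}

int main() { return smaller(0, 3); }
```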

* Double-escape special registers in inline asm.
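
In GCC-style inline assembly, `%` introduces an operand reference (`%0`, `%1`, …), so a literal percent sign — as in PTX special registers such as `%laneid` — must be written `%%`; clang enforces this where nvcc tolerated the single-`%` spelling. A hedged sketch of the pattern (CUDA device code, not the exact CUTLASS snippet):

```cpp
__device__ unsigned lane_id() {
  unsigned lane;
  // "%%laneid" escapes the '%' so the emitted PTX reads "mov.u32 %r, %laneid;".
  asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane));
  return lane;
}

__global__ void write_lane_ids(unsigned* out) {
  out[threadIdx.x] = lane_id();
}
```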

* small change

---------

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
Author:       Christian Sigg
Committed by: GitHub
Date:         2023-12-08 20:42:12 +01:00
Commit:       e1483d5fa0
Parent:       f4a0216601
46 changed files with 308 additions and 273 deletions


@@ -193,8 +193,8 @@ struct TestbedImpl {
 using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90::RasterOrderOptions;
-static_assert(rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
 static constexpr uint32_t mma_promotion_interval = 4;
@@ -523,9 +523,6 @@ struct TestbedImpl {
 Gemm& gemm_op,
 typename Gemm::Arguments& arguments,
 cutlass::device_memory::allocation<uint8_t>& workspace) {
-int M = cute::size<0>(problem_size);
-int N = cute::size<1>(problem_size);
-int K = cute::size<2>(problem_size);
 int L = 1;
 if constexpr(cute::rank(ProblemShapeType{}) == 4) {
 L = cute::size<3>(problem_size);
@@ -581,7 +578,7 @@ struct TestbedImpl {
 cutlass::KernelHardwareInfo hw_info;
 hw_info.device_id = 0;
 if (not profiling) {
-this->sm_count = min(MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+this->sm_count = std::min(MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
 hw_info.sm_count = this->sm_count;
 }
 else {
@@ -1240,7 +1237,7 @@ struct Testbed3xFusionOperation {
 hw_info.device_id = 0;
 if (not profiling) {
-impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
 hw_info.sm_count = impl_.sm_count;
 }
 else {


@@ -173,7 +173,7 @@ public:
 HostScalarBroadcast(){}
 template<typename ProblemShapeType, typename TestBedImpl>
 HostScalarBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-:_scalar(ElementCompute(Value)), Base(check_relative_equality) {}
+: Base(check_relative_equality), _scalar(ElementCompute(Value)) {}
 template <class ElementAccumulator>
 ElementCompute visit(
@@ -232,7 +232,7 @@ public:
 HostRowBroadcast(){}
 template<typename ProblemShapeType>
 HostRowBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-:impl_(impl), Base(check_relative_equality) {
+: Base(check_relative_equality), impl_(impl) {
 auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
 _N = cute::get<1>(problem_shape_MNKL);
 _bias.resize(cutlass::Coord<1>(_N));
@@ -300,7 +300,7 @@ public:
 HostColBroadcast(){}
 template<typename ProblemShapeType>
 HostColBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-:impl_(impl), Base(check_relative_equality) {
+: Base(check_relative_equality), impl_(impl) {
 auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
 _M = cute::get<0>(problem_shape_MNKL);
 _bias.resize(cutlass::Coord<1>(_M));
@@ -382,7 +382,7 @@ public:
 HostAuxLoad(){}
 template<typename ProblemShapeType>
 HostAuxLoad(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-:impl_(impl), Base(check_relative_equality){
+: Base(check_relative_equality), impl_(impl){
 auto problem_shape_NMKL = cute::append<4>(problem_size, 1);
 auto [_M, _N, K, _L] = problem_shape_NMKL;
 auto aux_coord = cutlass::make_Coord(_M * _L, _N);
@@ -513,8 +513,8 @@ public:
 HostUnaryCompute(){}
 template <typename ProblemShapeType, typename TestBedImpl>
 HostUnaryCompute(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-_child_0(problem_size, impl, check_relative_equality),
-Base(check_relative_equality) { }
+Base(check_relative_equality),
+_child_0(problem_size, impl, check_relative_equality) { }
 template <class ElementAccumulator>
 ElementCompute visit(
@@ -578,8 +578,8 @@ public:
 HostAuxStore(){}
 template <typename ProblemShapeType>
 HostAuxStore(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-impl_(impl),
-Base(check_relative_equality) {
+Base(check_relative_equality),
+impl_(impl) {
 auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
 auto [_M, _N, K, _L] = problem_shape_MNKL;
 auto aux_coord = cutlass::make_Coord(_M * _L, _N);
@@ -677,8 +677,8 @@ public:
 HostRowReduce(){}
 template <typename ProblemShapeType>
 HostRowReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-impl_(impl),
-Base(check_relative_equality) {
+Base(check_relative_equality),
+impl_(impl) {
 auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
 _N = cute::get<1>(problem_shape_MNKL);
 _tensor_row_reduce.resize(cutlass::Coord<1>(_N));
@@ -764,8 +764,8 @@ public:
 HostColumnReduce(){}
 template <typename ProblemShapeType>
 HostColumnReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-impl_(impl),
-Base(check_relative_equality) {
+Base(check_relative_equality),
+impl_(impl) {
 auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
 _M = cute::get<0>(problem_shape_MNKL);
 _tensor_column_reduce.resize(cutlass::Coord<1>(_M));
@@ -850,9 +850,8 @@ public:
 HostScalarReduce(){}
 template <typename ProblemShapeType>
 HostScalarReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-impl_(impl),
-Base(check_relative_equality) {
-auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+Base(check_relative_equality),
+impl_(impl) {
 _tensor_scalar_reduce.resize(cutlass::Coord<1>(1));
 _reference_scalar_reduce.resize(cutlass::Coord<1>(1));
 _reduce_buffer.resize(cutlass::Coord<1>(1));
@@ -1229,7 +1228,6 @@ public:
 auto N = cute::get<1>(problem_shape_MNKL);
 auto K = cute::get<2>(problem_shape_MNKL);
 auto L = cute::get<3>(problem_shape_MNKL);
-auto coord_0 = cutlass::make_Coord(0);
 auto A = cute::make_tensor(impl_.tensor_A.host_data(),
 cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
@@ -1307,7+1305,7 @@ public:
 cutlass::KernelHardwareInfo hw_info;
 hw_info.device_id = 0;
 if (not profiling) {
-impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
 hw_info.sm_count = impl_.sm_count;
 }
 else {
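
The constructor changes in the hunks above all reorder member-initializer lists so that the base class is initialized before the data members, matching declaration order; clang warns on mismatched order (`-Wreorder-ctor`), which becomes a hard error in `-Werror` builds. A minimal illustration with hypothetical names:

```cpp
struct Base {
  explicit Base(bool check) : check_(check) {}
  bool check_;
};

struct HostNode : Base {
  // Initialization happens in declaration order (Base, then impl_), so the
  // initializer list is written in that order to avoid -Wreorder-ctor.
  HostNode(int impl, bool check)
    : Base(check), impl_(impl) {}
  // ': impl_(impl), Base(check)' triggers the warning under clang.

  int impl_;
};

int main() { return HostNode(0, true).impl_; }
```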


@@ -158,7 +158,6 @@ struct Testbed3xTensorBroadcast {
 bool use_bias)
 {
 auto [M, N, K, L] = problem_shape_MNKL;
-auto coord_0 = cutlass::make_Coord(0);
 impl_.tensor_D.sync_host();
 EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_A.host_view()), 0);
@@ -218,7 +217,6 @@ struct Testbed3xTensorBroadcast {
 auto N = cute::get<1>(problem_shape_MNKL);
 auto K = cute::get<2>(problem_shape_MNKL);
 auto L = cute::get<3>(problem_shape_MNKL);
-auto coord_0 = cutlass::make_Coord(0);
 auto A = cute::make_tensor(impl_.tensor_A.host_data(),
 cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
@@ -338,7 +336,7 @@ struct Testbed3xTensorBroadcast {
 cutlass::KernelHardwareInfo hw_info;
 hw_info.device_id = 0;
 if (not profiling) {
-impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
 hw_info.sm_count = impl_.sm_count;
 }
 else {


@@ -163,7 +163,7 @@ public:
 using EVTModule = HEVT<
 HostAuxStore<Gemm, true>,
 HEVT<
-HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>, // activation(Z) * scaled_d
+HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>, // activation(Z) * scaled_d
 HEVT<
 HostCompute<Gemm, ActivationFn>, // activation(Z)
 HEVT<
@@ -174,11 +174,11 @@ public:
 HostCompute<Gemm, cutlass::homogeneous_multiply_add>,
 HostScalarBroadcast<Gemm, 1, 3>, // scale_a * scale_b * alpha
 HostAccumulator<Gemm>,
-HostColBroadcast<Gemm, ElementD>,
+HostColBroadcast<Gemm, ElementD>
 >
 >
 >,
-HostScalarBroadcast<Gemm, 1>, // scale_d
+HostScalarBroadcast<Gemm, 1> // scale_d
 >
 >;
 };
@@ -211,26 +211,26 @@ public:
 HostCompute<Gemm, cutlass::homogeneous_multiply_add>,
 HostScalarBroadcast<Gemm, 1, 3>, // scale_a * scale_b * alpha
 HostAccumulator<Gemm>,
-HostColBroadcast<Gemm, ElementD>,
+HostColBroadcast<Gemm, ElementD>
 >
 >,
 // D = activation(Z) * scaled_d, amax_d = max(abs(elements in D))
 HEVT<
-HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,
+HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,
 HEVT<
 HostScalarReduce<Gemm, amax, float>,
 HEVT<
 HostCompute<Gemm, ActivationFn>, //activation(Z) * scaled_d
-HostAccumulator<Gemm>, // Z
+HostAccumulator<Gemm> // Z
 >
 >,
-HostScalarBroadcast<Gemm, 1>, // scale_d
+HostScalarBroadcast<Gemm, 1> // scale_d
 >,
 // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux))
 HEVT<
 HostAuxStore<Gemm, false, ElementD, cutlass::layout::RowMajor>,
 HEVT<
-HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,
+HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,
 HEVT<
 HostScalarReduce<Gemm, amax, float>,
 HostAccumulator<Gemm>
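
Two details in this last file are worth calling out. First, `ScaleOutOp<ElementD>::Op` names a member template through a dependent type, so the `template` disambiguator is required (`::template Op`); clang rejects the unqualified spelling that nvcc accepted. Second, the trailing commas removed from the `HEVT<...>` argument lists are not valid in a template argument list, which clang also diagnoses. A small sketch of the `::template` rule, with hypothetical names standing in for the CUTLASS types:

```cpp
template <class Element>
struct ScaleOp {                 // stand-in for detail::ScaleOutOp<ElementD>
  template <class T>
  struct Op { T value; };        // member template
};

template <template <class> class F>
struct HostCompute {             // stand-in: takes a template template argument
  F<float> op;
};

template <class Element>
struct Epilogue {
  // 'template' is required because ScaleOp<Element> is a dependent type;
  // clang rejects 'HostCompute<ScaleOp<Element>::Op>'.
  using Compute = HostCompute<ScaleOp<Element>::template Op>;
};

int main() {
  Epilogue<int>::Compute c;
  c.op.value = 1.0f;
  return 0;
}
```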