Files
cutlass/examples/cute/tutorial/hopper/wgmma_sm90.cu
2025-05-13 15:55:29 -04:00

612 lines
21 KiB
Plaintext

/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <cute/tensor.hpp>
#include "cutlass/cluster_launch.hpp"
#include "cutlass/util/print_error.hpp"
#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/helper_cuda.hpp"
using namespace cute;
template <class ElementA,
class ElementB,
class SmemLayoutA, // (M,K,P)
class SmemLayoutB> // (N,K,P)
struct SharedStorage
{
alignas(128) cute::ArrayEngine<ElementA, cosize_v<SmemLayoutA>> A;
alignas(128) cute::ArrayEngine<ElementB, cosize_v<SmemLayoutB>> B;
};
template <class ProblemShape, class CtaTiler,
class TA, class AStride, class ASmemLayout, class TiledCopyA,
class TB, class BStride, class BSmemLayout, class TiledCopyB,
class TC, class CStride, class TiledMma,
class Alpha, class Beta>
__global__ static
__launch_bounds__(decltype(size(TiledMma{}))::value)
void
gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler,
TA const* A, AStride dA, ASmemLayout sA_layout, TiledCopyA copy_a,
TB const* B, BStride dB, BSmemLayout sB_layout, TiledCopyB copy_b,
TC * C, CStride dC, TiledMma mma,
Alpha alpha, Beta beta)
{
// Preconditions
CUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{}); // (M, N, K)
CUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{}); // (BLK_M, BLK_N, BLK_K)
CUTE_STATIC_ASSERT_V(size(copy_a) == size(mma)); // NumThreads
CUTE_STATIC_ASSERT_V(size(copy_b) == size(mma)); // NumThreads
static_assert(is_static<ASmemLayout>::value);
static_assert(is_static<BSmemLayout>::value);
CUTE_STATIC_ASSERT_V(size<0>(ASmemLayout{}) == size<0>(cta_tiler)); // BLK_M
CUTE_STATIC_ASSERT_V(size<0>(BSmemLayout{}) == size<1>(cta_tiler)); // BLK_N
CUTE_STATIC_ASSERT_V(size<1>(ASmemLayout{}) == size<2>(cta_tiler)); // BLK_K
CUTE_STATIC_ASSERT_V(size<1>(BSmemLayout{}) == size<2>(cta_tiler)); // BLK_K
CUTE_STATIC_ASSERT_V(congruent(select<0,2>(shape_MNK), dA)); // dA strides for shape MK
CUTE_STATIC_ASSERT_V(congruent(select<1,2>(shape_MNK), dB)); // dB strides for shape NK
CUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC)); // dC strides for shape MN
//
// Full and Tiled Tensors
//
// Represent the full tensors
Tensor mA = make_tensor(make_gmem_ptr(A), select<0,2>(shape_MNK), dA); // (M,K)
Tensor mB = make_tensor(make_gmem_ptr(B), select<1,2>(shape_MNK), dB); // (N,K)
Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N)
// Get the appropriate blocks for this thread block
auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k)
Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k)
Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k)
Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N)
// Shared memory tensors
extern __shared__ char shared_memory[];
using SharedStorage = SharedStorage<TA, TB, ASmemLayout, BSmemLayout>;
SharedStorage& smem = *reinterpret_cast<SharedStorage*>(shared_memory);
Tensor sA = make_tensor(make_smem_ptr(smem.A.begin()), ASmemLayout{}); // (BLK_M,BLK_K,PIPE)
Tensor sB = make_tensor(make_smem_ptr(smem.B.begin()), BSmemLayout{}); // (BLK_N,BLK_K,PIPE)
//
// Partition the copying of A and B tiles across the threads
//
ThrCopy thr_copy_a = copy_a.get_slice(threadIdx.x);
Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k)
Tensor sA_ = as_position_independent_swizzle_tensor(sA);
Tensor tAsA = thr_copy_a.partition_D(sA_); // (CPY,CPY_M,CPY_K,PIPE)
ThrCopy thr_copy_b = copy_b.get_slice(threadIdx.x);
Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k)
Tensor sB_ = as_position_independent_swizzle_tensor(sB);
Tensor tBsB = thr_copy_b.partition_D(sB_); // (CPY,CPY_N,CPY_K,PIPE)
CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA)); // CPY_M
CUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tAsA)); // CPY_K
CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBsB)); // CPY_N
CUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBsB)); // CPY_K
//
// PREFETCH
//
// auto K_PIPE_MAX = size<3>(tAsA);
// // Total count of tiles
// int k_tile_count = size<3>(tAgA);
// // Current tile index in gmem to read from
// int k_tile_next = 0;
// // Start async loads for all pipes but the last
// CUTE_UNROLL
// for (int k_pipe = 0; k_pipe < K_PIPE_MAX-1; ++k_pipe) {
// copy(copy_a, tAgA(_,_,_,k_tile_next), tAsA(_,_,_,k_pipe));
// copy(copy_b, tBgB(_,_,_,k_tile_next), tBsB(_,_,_,k_pipe));
// cp_async_fence();
// --k_tile_count;
// if (k_tile_count > 0) { ++k_tile_next; }
// }
//
// Define A/B partitioning and C accumulators
//
ThrMMA thr_mma = mma.get_slice(threadIdx.x);
Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE)
Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE)
Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N)
// Allocate registers for pipelining
Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE)
Tensor tCrB = thr_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE)
// Allocate the accumulators -- same size as the projected data
Tensor tCrC = thr_mma.make_fragment_C(tCgC); // (MMA,MMA_M,MMA_N)
CUTE_STATIC_ASSERT_V((size<1>(tCgC) == size<1>(tCsA))); // MMA_M
CUTE_STATIC_ASSERT_V((size<2>(tCgC) == size<1>(tCsB))); // MMA_N
CUTE_STATIC_ASSERT_V((size<2>(tCsA) == size<2>(tCsB))); // MMA_K
// Clear the accumulators
clear(tCrC);
#if 0
if(thread0()) {
print(" mA : "); print( mA); print("\n");
print(" gA : "); print( gA); print("\n");
print(" sA : "); print( sA); print("\n");
print("tAgA : "); print(tAgA); print("\n");
print("tAsA : "); print(tAsA); print("\n");
}
#endif
#if 0
if(thread0()) {
print(" mB : "); print( mB); print("\n");
print(" gB : "); print( gB); print("\n");
print(" sB : "); print( sB); print("\n");
print("tBgB : "); print(tBgB); print("\n");
print("tBsB : "); print(tBsB); print("\n");
}
#endif
#if 0
if(thread0()) {
print(" mC : "); print( mC); print("\n");
print(" gC : "); print( gC); print("\n");
print("tCsA : "); print(tCsA); print("\n");
print("tCsB : "); print(tCsB); print("\n");
print("tCgC : "); print(tCgC); print("\n");
print("tCrA : "); print(tCrA); print("\n");
print("tCrB : "); print(tCrB); print("\n");
print("tCrC : "); print(tCrC); print("\n");
}
#endif
#if 1
// Total number of k-tiles
auto K_TILE_MAX = size<3>(tAgA);
// Number of pipelined k-tiles in smem
auto K_PIPE_MAX = size<3>(tAsA);
//
// PREFETCH
//
// Prefetch all but the last
CUTE_UNROLL
for (int k = 0; k < K_PIPE_MAX-1; ++k)
{
copy(copy_a, tAgA(_,_,_,k), tAsA(_,_,_,k));
copy(copy_b, tBgB(_,_,_,k), tBsB(_,_,_,k));
cp_async_fence();
}
// Clear the accumulators
clear(tCrC);
__syncthreads();
//
// PIPELINED MAIN LOOP
//
// Current pipe to read from
int k_pipe_read = 0;
// Current pipe to write to
int k_pipe_write = K_PIPE_MAX-1;
CUTE_NO_UNROLL
for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile)
{
int k_tile_next = k_tile + (K_PIPE_MAX-1);
k_tile_next = (k_tile_next >= K_TILE_MAX) ? K_TILE_MAX-1 : k_tile_next;
//
// Copy gmem to smem for k_tile_write
//
copy(copy_a, tAgA(_,_,_,k_tile_next), tAsA(_,_,_,k_pipe_write));
copy(copy_b, tBgB(_,_,_,k_tile_next), tBsB(_,_,_,k_pipe_write));
cp_async_fence();
// Advance k_pipe_write
++k_pipe_write;
k_pipe_write = (k_pipe_write == K_PIPE_MAX) ? 0 : k_pipe_write;
//
// Compute on k_tile
//
// Wait on all cp.async -- optimize by pipelining to overlap GMEM reads
cp_async_wait<0>();
warpgroup_fence_operand(tCrC);
warpgroup_arrive();
// (V,M,K) x (V,N,K) => (V,M,N)
cute::gemm(mma, tCrA(_,_,_,k_pipe_read), tCrB(_,_,_,k_pipe_read), tCrC);
warpgroup_commit_batch();
/// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
warpgroup_wait<0>();
warpgroup_fence_operand(tCrC);
// Advance k_pipe_read
++k_pipe_read;
k_pipe_read = (k_pipe_read == K_PIPE_MAX) ? 0 : k_pipe_read;
}
#endif
//
// Epilogue
//
axpby(alpha, tCrC, beta, tCgC);
}
// Setup params for a NT GEMM
template <class TA, class TB, class TC,
class Alpha, class Beta>
void
gemm_nt(int m, int n, int k,
Alpha alpha,
TA const* A, int ldA,
TB const* B, int ldB,
Beta beta,
TC * C, int ldC,
cudaStream_t stream = 0)
{
// Define shapes (dynamic)
auto M = int(m);
auto N = int(n);
auto K = int(k);
auto prob_shape = make_shape(M, N, K); // (M, N, K)
// Define NT strides (mixed)
auto dA = make_stride(Int<1>{}, ldA); // (dM, dK)
auto dB = make_stride(Int<1>{}, ldB); // (dN, dK)
auto dC = make_stride(Int<1>{}, ldC); // (dM, dN)
// Define CTA tile sizes (static)
auto bM = Int<128>{};
auto bN = Int<128>{};
auto bK = Int< 64>{};
auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K)
auto bP = Int<3>{}; // Pipeline
// Define the smem layouts (static)
auto sA = tile_to_shape(GMMA::Layout_MN_SW128_Atom<TA>{}, make_shape(bM,bK,bP));
auto sB = tile_to_shape(GMMA::Layout_MN_SW128_Atom<TB>{}, make_shape(bN,bK,bP));
// Define the thread layouts (static)
TiledCopy copyA = make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, TA>{},
Layout<Shape<_16,_8>>{}, // Thr layout 32x4 m-major
Layout<Shape< _8,_1>>{});// Val layout 8x1 m-major
TiledCopy copyB = make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, TB>{},
Layout<Shape<_16,_8>>{}, // Thr layout 32x4 n-major
Layout<Shape< _8,_1>>{});// Val layout 8x1 n-major
TiledMMA tiled_mma = make_tiled_mma(SM90_64x64x16_F16F16F16_SS<GMMA::Major::MN,GMMA::Major::MN>{});
#if 0
print(copyA);
print(copyB);
print(mmaC);
#endif
#if 0
print_latex(copyA);
print_latex(copyB);
print_latex(mmaC);
#endif
//
// Setup and Launch
//
// Launch parameter setup
dim3 dimBlock(size(tiled_mma));
dim3 dimCluster(1, 1, 1);
dim3 dimGrid(round_up(size(ceil_div(m, bM)), dimCluster.x),
round_up(size(ceil_div(n, bN)), dimCluster.y));
int smemBytes = sizeof(SharedStorage<TA, TB, decltype(sA), decltype(sB)>);
auto* kernel_ptr = &gemm_device<decltype(prob_shape), decltype(cta_tiler),
TA, decltype(dA), decltype(sA), decltype(copyA),
TB, decltype(dB), decltype(sB), decltype(copyB),
TC, decltype(dC), decltype(tiled_mma),
decltype(alpha), decltype(beta)>;
CUTE_CHECK_ERROR(cudaFuncSetAttribute(kernel_ptr,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smemBytes));
// Kernel Launch
cutlass::ClusterLaunchParams params = {dimGrid, dimBlock, dimCluster, smemBytes};
cutlass::Status status = cutlass::launch_kernel_on_cluster(params, (void const*) kernel_ptr,
prob_shape, cta_tiler,
A, dA, sA, copyA,
B, dB, sB, copyB,
C, dC, tiled_mma,
alpha, beta);
CUTE_CHECK_LAST();
if (status != cutlass::Status::kSuccess) {
std::cerr << "Error: Failed at kernel Launch" << std::endl;
}
}
// Setup params for a TN GEMM
template <class TA, class TB, class TC,
class Alpha, class Beta>
void
gemm_tn(int m, int n, int k,
Alpha alpha,
TA const* A, int ldA,
TB const* B, int ldB,
Beta beta,
TC * C, int ldC,
cudaStream_t stream = 0)
{
// Define shapes (dynamic)
auto M = int(m);
auto N = int(n);
auto K = int(k);
auto prob_shape = make_shape(M, N, K); // (M, N, K)
// Define TN strides (mixed)
auto dA = make_stride(ldA, Int<1>{}); // (dM, dK)
auto dB = make_stride(ldB, Int<1>{}); // (dN, dK)
auto dC = make_stride(Int<1>{}, ldC); // (dM, dN)
// Define CTA tile sizes (static)
auto bM = Int<128>{};
auto bN = Int<128>{};
auto bK = Int< 64>{};
auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K)
auto bP = Int<3>{}; // Pipeline
// Define the smem layouts (static)
auto sA = tile_to_shape(GMMA::Layout_K_SW128_Atom<TA>{}, make_shape(bM,bK,bP));
auto sB = tile_to_shape(GMMA::Layout_K_SW128_Atom<TB>{}, make_shape(bN,bK,bP));
// Define the thread layouts (static)
TiledCopy copyA = make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, TA>{},
Layout<Shape<_16,_8>,Stride<_8,_1>>{}, // Thr layout 16x8 k-major
Layout<Shape< _1,_8>>{}); // Val layout 1x8
TiledCopy copyB = make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, TB>{},
Layout<Shape<_16,_8>,Stride<_8,_1>>{}, // Thr layout 16x8 k-major
Layout<Shape< _1,_8>>{}); // Val layout 1x8
TiledMMA tiled_mma = make_tiled_mma(SM90_64x64x16_F16F16F16_SS<GMMA::Major::K,GMMA::Major::K>{});
#if 0
print(copyA);
print(copyB);
print(mmaC);
#endif
#if 0
print_latex(copyA);
print_latex(copyB);
print_latex(mmaC);
#endif
//
// Setup and Launch
//
// Launch parameter setup
int smem_size = int(sizeof(SharedStorage<TA, TB, decltype(sA), decltype(sB)>));
dim3 dimBlock(size(tiled_mma));
dim3 dimCluster(1, 1, 1);
dim3 dimGrid(round_up(size(ceil_div(m, bM)), dimCluster.x),
round_up(size(ceil_div(n, bN)), dimCluster.y));
cutlass::ClusterLaunchParams params = {dimGrid, dimBlock, dimCluster, smem_size};
void const* kernel_ptr = reinterpret_cast<void const*>(
&gemm_device<decltype(prob_shape), decltype(cta_tiler),
TA, decltype(dA), decltype(sA), decltype(copyA),
TB, decltype(dB), decltype(sB), decltype(copyB),
TC, decltype(dC), decltype(tiled_mma),
decltype(alpha), decltype(beta)>);
CUTE_CHECK_ERROR(cudaFuncSetAttribute(
kernel_ptr,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size));
// Kernel Launch
cutlass::Status status = cutlass::launch_kernel_on_cluster(params, kernel_ptr,
prob_shape, cta_tiler,
A, dA, sA, copyA,
B, dB, sB, copyB,
C, dC, tiled_mma,
alpha, beta);
CUTE_CHECK_LAST();
if (status != cutlass::Status::kSuccess) {
std::cerr << "Error: Failed at kernel Launch" << std::endl;
}
}
template <class TA, class TB, class TC,
class Alpha, class Beta>
void
gemm(char transA, char transB, int m, int n, int k,
Alpha alpha,
TA const* A, int ldA,
TB const* B, int ldB,
Beta beta,
TC * C, int ldC,
cudaStream_t stream = 0)
{
if (transA == 'N' && transB == 'T') {
return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream);
} else
if (transA == 'T' && transB == 'N') {
return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream);
}
assert(false && "Not implemented");
}
int main(int argc, char** argv)
{
cudaDeviceProp props;
int current_device_id;
cudaGetDevice(&current_device_id);
cudaGetDeviceProperties(&props, current_device_id);
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (props.major != 9) {
std::cout << "This example requires NVIDIA's Hopper Architecture GPU with compute capability 90a" << std::endl;
// Return 0 so tests pass if run on unsupported architectures or CUDA Toolkits.
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
int m = 5120;
if (argc >= 2)
sscanf(argv[1], "%d", &m);
int n = 5120;
if (argc >= 3)
sscanf(argv[2], "%d", &n);
int k = 4096;
if (argc >= 4)
sscanf(argv[3], "%d", &k);
char transA = 'N';
if (argc >= 5)
sscanf(argv[4], "%c", &transA);
char transB = 'T';
if (argc >= 6)
sscanf(argv[5], "%c", &transB);
using TA = cute::half_t;
using TB = cute::half_t;
using TC = cute::half_t;
using TI = cute::half_t;
TI alpha = TI(1.0f);
TI beta = TI(0.0f);
thrust::host_vector<TA> h_A(m*k);
thrust::host_vector<TB> h_B(n*k);
thrust::host_vector<TC> h_C(m*n);
// Initialize the tensors
for (int j = 0; j < m*k; ++j) h_A[j] = TA(int((rand() % 2) ? 1 : -1));
for (int j = 0; j < n*k; ++j) h_B[j] = TB(int((rand() % 2) ? 1 : -1));
for (int j = 0; j < m*n; ++j) h_C[j] = TC(0);
thrust::device_vector<TA> d_A = h_A;
thrust::device_vector<TB> d_B = h_B;
thrust::device_vector<TC> d_C = h_C;
double gflops = (2.0*m*n*k) * 1e-9;
const int timing_iterations = 100;
GPU_Clock timer;
int ldA = 0, ldB = 0, ldC = m;
if (transA == 'N') {
ldA = m;
} else if (transA == 'T') {
ldA = k;
} else {
assert(false);
}
if (transB == 'N') {
ldB = k;
} else if (transB == 'T') {
ldB = n;
} else {
assert(false);
}
// Run once
d_C = h_C;
gemm(transA, transB, m, n, k,
alpha,
d_A.data().get(), ldA,
d_B.data().get(), ldB,
beta,
d_C.data().get(), ldC);
CUTE_CHECK_LAST();
thrust::host_vector<TC> cute_result = d_C;
// Timing iterations
timer.start();
for (int i = 0; i < timing_iterations; ++i) {
gemm(transA, transB, m, n, k,
alpha,
d_A.data().get(), ldA,
d_B.data().get(), ldB,
beta,
d_C.data().get(), ldC);
}
double cute_time = timer.seconds() / timing_iterations;
CUTE_CHECK_LAST();
printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000);
#else
std::cout << "CUTLASS_ARCH_MMA_SM90_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl;
#endif
return 0;
}