CUTLASS 2.0 (#62)

CUTLASS 2.0

Substantially refactored for

- Better performance, particularly for native Turing Tensor Cores
- Robust and durable templates spanning the design space
- Encapsulated functionality embodying modern C++11 programming techniques
- Optimized containers and data types for efficient, generic, portable device code

Updates to:
- Quick start guide
- Documentation
- Utilities
- CUTLASS Profiler

Native Turing Tensor Cores
- Efficient GEMM kernels targeting Turing Tensor Cores
- Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands

Coverage of existing CUTLASS functionality:
- GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs
- Volta Tensor Cores through native mma.sync and through WMMA API
- Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions
- Batched GEMM operations
- Complex-valued GEMMs

Note: this commit and all that follow require a host compiler supporting C++11 or greater.
This commit is contained in:
Andrew Kerr
2019-11-19 16:55:34 -08:00
committed by GitHub
parent b5cab177a9
commit fb335f6a5f
5434 changed files with 599799 additions and 250176 deletions

View File

@ -0,0 +1,33 @@
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Unit-test executable covering threadblock-scoped matrix multiply (MMA)
# implementations: pipelined and single-stage variants through the WMMA API
# (SM70/SM75), native mma.sync paths (SM70/SM75), and the SIMT path.
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_threadblock
mma_pipelined_wmma_sm70.cu
mma_pipelined_wmma_sm75.cu
mma_singlestage_wmma_sm70.cu
mma_singlestage_wmma_sm75.cu
mma_pipelined_sm70.cu
mma_pipelined_sm75.cu
mma_pipelined_simt.cu
)

View File

@ -0,0 +1,640 @@
/***************************************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for threadblock level GEMV
*/
#include "../../common/cutlass_unit_test.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/core_io.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/gemm/threadblock/gemv.h"
#include "cutlass/gemm/threadblock/default_gemv_core.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Device kernel exercising a threadblock-scoped batched GEMV.
///
/// Expected launch configuration (see the host-side driver):
///   grid  = (1, 1)                        -- a single CTA
///   block = (Shape::kN / LDG_N, num_batch, 1)
/// Each threadIdx.y slice operates on its own batch; stride_a/b/c are the
/// per-batch pointer offsets (in elements) for the A, B, and C tensors.
template <typename Gemv, typename LongIndex, typename RefA, typename RefB, typename RefC>
__global__ void batched_gemv_threadblock_test_kernel(
  cutlass::gemm::GemmCoord problem_size,
  LongIndex stride_a,
  LongIndex stride_b,
  LongIndex stride_c,
  RefA ref_A,
  RefB ref_B,
  RefC ref_C
) {
  typename Gemv::IteratorA::TensorCoord threadblock_offset_A(0, 0);
  typename Gemv::IteratorB::TensorCoord threadblock_offset_B(0, 0);

  // Fixed: use IteratorC's own coordinate type here (the original declared
  // this with IteratorB::TensorCoord — layout-compatible, but inconsistent).
  typename Gemv::IteratorC::TensorCoord threadblock_offset_C(0, 0);

  // Advance each tensor ref to the batch owned by this threadIdx.y slice.
  ref_A.add_pointer_offset(threadIdx.y * stride_a);
  ref_B.add_pointer_offset(threadIdx.y * stride_b);
  ref_C.add_pointer_offset(threadIdx.y * stride_c);

  // Construct iterators to the A and B operands.
  typename Gemv::IteratorA::Params params_A(ref_A.layout());
  typename Gemv::IteratorA iterator_A(params_A, ref_A.data(), { problem_size.m(), problem_size.k() }, 0, threadblock_offset_A);

  typename Gemv::IteratorB::Params params_B(ref_B.layout());
  typename Gemv::IteratorB iterator_B(params_B, ref_B.data(), { problem_size.k(), problem_size.n() }, threadIdx.x, threadblock_offset_B);

  Gemv gemv;

  typename Gemv::FragmentC accum;
  accum.clear();

  // Compute the threadblock-scoped matrix multiply-accumulate.
  gemv(problem_size, accum, iterator_A, iterator_B, accum);

  // IteratorC is PitchLinear<> and assumes n() is the contiguous dimension.
  typename Gemv::IteratorC::Params params_C(ref_C.layout());
  typename Gemv::IteratorC iterator_C(params_C, ref_C.data(), { problem_size.m(), problem_size.n() }, threadIdx.x, threadblock_offset_C);
  iterator_C.store(accum);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Host-side driver for the batched GEMV threadblock test.
///
/// Allocates num_batch contiguous batches of A, B, and C in single backing
/// tensors, launches one CTA in which threadIdx.y indexes the batch, then
/// verifies each computed batch against a host reference GEMM.
///
/// Template parameters:
///   Shape_                 -- threadblock-level GemmShape
///   ElementAB_             -- element type of the A and B operands
///   ElementC_              -- element type of the C operand / accumulator
///   LayoutA_/B_/C_         -- operand layouts
///   LDG_N, LDG_K           -- per-thread access widths in N and K
///   MAX_THREADS_PER_BLOCK  -- sanity bound on the launch configuration
///   DEBUG                  -- if true, run a single batch with sequential fills
template<typename Shape_,
         typename ElementAB_,
         typename ElementC_,
         typename LayoutA_,
         typename LayoutB_,
         typename LayoutC_,
         int LDG_N,
         int LDG_K,
         int MAX_THREADS_PER_BLOCK=512,
         bool DEBUG=false>
void batched_gemv_threadblock_test(cutlass::gemm::GemmCoord problem_size, int num_batch)
{
  using Shape = Shape_;
  using ElementA = ElementAB_;
  using LayoutA = LayoutA_;
  using ElementB = ElementAB_;
  using LayoutB = LayoutB_;
  using ElementC = ElementC_;
  using LayoutC = LayoutC_;
  using ThreadShape = cutlass::gemm::GemmShape<1, LDG_N, LDG_K>;

  using Core = typename cutlass::gemm::threadblock::DefaultGemvCore<
    Shape,
    ThreadShape,
    ElementA,
    LayoutA,
    ElementB,
    LayoutB,
    ElementC,
    LayoutC
  >;

  if (DEBUG) {
    // Deterministic, single-batch run for easier inspection.
    num_batch = 1;
  }

  using Mma = cutlass::gemm::threadblock::Gemv<Core>;

  // Create host tensors that will be the backing store for the batches.
  // Note that no device memory is initially allocated.
  cutlass::HostTensor<ElementA, LayoutA> matrix_A({problem_size.m(), problem_size.k()}, false);
  cutlass::HostTensor<ElementB, LayoutB> matrix_B({problem_size.k(), problem_size.n()}, false);
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed({problem_size.m(), problem_size.n()}, false);
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference({problem_size.m(), problem_size.n()}, false);

  // Reserve memory for the whole batch of tensors.
  matrix_A.reserve(problem_size.m()*problem_size.k()*num_batch);
  matrix_B.reserve(problem_size.n()*problem_size.k()*num_batch);
  matrix_C_computed.reserve(problem_size.m()*problem_size.n()*num_batch);
  matrix_C_reference.reserve(problem_size.m()*problem_size.n()*num_batch, false);

  // Fill each tensor batch.
  const int seed = 6834;
  for (int b = 0; b < num_batch; b++) {
    if (DEBUG) {
      cutlass::reference::host::BlockFillSequential(
        matrix_A.host_data_ptr_offset(b*matrix_A.capacity()), matrix_A.capacity());
      cutlass::reference::host::BlockFillSequential(
        matrix_B.host_data_ptr_offset(b*matrix_B.capacity()), matrix_B.capacity());
    }
    else {
      cutlass::reference::host::TensorFillRandomUniform(
        matrix_A.host_view(b*matrix_A.capacity()),
        seed + 1660,
        8,
        -8,
        0
      );
      cutlass::reference::host::TensorFillRandomUniform(
        matrix_B.host_view(b*matrix_B.capacity()),
        seed + 1880,
        8,
        -8,
        0
      );
    }

    cutlass::reference::host::TensorFill(matrix_C_computed.host_view(b*matrix_C_computed.capacity()));
    cutlass::reference::host::TensorFill(matrix_C_reference.host_view(b*matrix_C_reference.capacity()));
  }

  matrix_A.sync_device();
  matrix_B.sync_device();
  matrix_C_computed.sync_device();

  dim3 grid(1, 1);      // only 1 CTA is used
  dim3 block(Shape::kN / LDG_N, num_batch, 1);

#if 0
  printf("block dim = %d x %d\n", block.x, block.y);
#endif

  // Some sanity checks on the launch configuration.
  EXPECT_TRUE( problem_size.n() % LDG_N == 0 );
  EXPECT_TRUE( block.x*block.y <= MAX_THREADS_PER_BLOCK );

  test::gemm::threadblock::batched_gemv_threadblock_test_kernel<Mma><<< grid, block >>>(
    problem_size,
    matrix_A.capacity(),
    matrix_B.capacity(),
    matrix_C_computed.capacity(),
    matrix_A.device_ref(),
    matrix_B.device_ref(),
    matrix_C_computed.device_ref()
  );

  // Catch launch-configuration errors first, then asynchronous execution errors.
  cudaError_t result = cudaGetLastError();
  EXPECT_EQ(result, cudaSuccess) << " kernel launch error: " << cudaGetErrorString(result);

  result = cudaDeviceSynchronize();
  EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result);

  matrix_C_computed.sync_host();

  // Compute the batched gemms on the host and compare per batch.
  for (int b = 0; b < num_batch; b++) {
    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
                                   ElementC, LayoutC, ElementC, ElementC> reference_gemm;

    reference_gemm(
      problem_size.mnk(),
      ElementC(1),
      matrix_A.host_ref(b*matrix_A.capacity()),
      matrix_B.host_ref(b*matrix_B.capacity()),
      ElementC(0),
      // Fixed: offset by matrix_C_reference's own capacity (the original used
      // matrix_C_computed.capacity() here; the values are equal, but pairing
      // each tensor with its own capacity keeps the indexing consistent).
      matrix_C_reference.host_ref(b*matrix_C_reference.capacity())
    );

    bool passed = cutlass::reference::host::TensorEquals(
      matrix_C_computed.host_view(b*matrix_C_computed.capacity()),
      matrix_C_reference.host_view(b*matrix_C_reference.capacity()));

    EXPECT_TRUE(passed)
      //<< "A:\n" << matrix_A.host_view() << "\n"
      //<< "B:\n" << matrix_B.host_view() << "\n"
      << "Batch: " << b << "\n"
      << "Reference:\n" << matrix_C_reference.host_view(b*matrix_C_reference.capacity()) << "\n"
      << "Computed:\n" << matrix_C_computed.host_view(b*matrix_C_computed.capacity()) << "\n";
  }
}
} // namespace threadblock
} // namespace gemm
} // namespace test
/////////////////////////////////////////////////////////////////////////////////////////////////
// Batched GEMV tests, layout combination "crc":
// A: ColumnMajor
// B: RowMajor
// C: ColumnMajor
//
// Test-name convention:
//   <num_batch>x<M>x<N>x<K>_<layouts>_<elementAB>_<elementC>_<LDG_N>N_<LDG_K>K
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp32_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_crc_fp32_fp32_4N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
const int num_batch = 5;
const int LDG_N = 4;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// N = 17 with Shape::kN = 32 exercises a problem size that does not divide
// the threadblock shape evenly.
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_fp32_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
float, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// half_t operands accumulated in fp32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp16_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp16_fp32_2N_8K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 8;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_fp16_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// int8 operands accumulated in int32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_i8_i32_2N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_i8_i32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// Batched GEMV tests, layout combination "rcr":
// A: RowMajor
// B: ColumnMajor
// C: RowMajor
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp32_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_rcr_fp32_fp32_4N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
const int num_batch = 5;
const int LDG_N = 4;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// N = 17 with Shape::kN = 32 exercises a problem size that does not divide
// the threadblock shape evenly.
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_fp32_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// half_t operands accumulated in fp32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp16_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp16_fp32_2N_8K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 8;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_fp16_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// int8 operands accumulated in int32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_i8_i32_2N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_i8_i32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// Batched GEMV tests, layout combination "rcc":
// A: RowMajor
// B: ColumnMajor
// C: ColumnMajor
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp32_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_rcc_fp32_fp32_4N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
const int num_batch = 5;
const int LDG_N = 4;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// N = 17 with Shape::kN = 32 exercises a problem size that does not divide
// the threadblock shape evenly.
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_fp32_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// half_t operands accumulated in fp32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp16_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp16_fp32_2N_8K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 8;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_fp16_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// int8 operands accumulated in int32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_i8_i32_2N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_i8_i32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}

View File

@ -0,0 +1,124 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "../../common/cutlass_unit_test.h"
#include "cutlass/epilogue/epilogue_workspace.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Kernel computes a synthetic accumulator tile and stores it out through the
/// epilogue under test.
///
/// Expected launch configuration: block = (warp size, warp count, 1); each
/// CTA covers one tile of Epilogue::Shape.
template <typename Epilogue>
__global__ void kernel_epilogue_workspace(typename Epilogue::Params params) {
  __shared__ typename Epilogue::SharedStorage shared_storage;

  int warp_id = threadIdx.y;
  int lane_id = threadIdx.x;

  Epilogue epilogue(params, shared_storage, warp_id, lane_id);

  //
  // Initialize accumulator tile: each thread writes its flattened thread id
  // so stored output can be attributed to the producing thread.
  //

  // Fixed: 'Element' was previously an undeclared name; take the fragment's
  // element type from the epilogue's accumulator fragment.
  using Element = typename Epilogue::FragmentC::Element;

  typename Epilogue::FragmentC accum;

  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < Epilogue::FragmentC::kElements; ++i) {
    accum[i] = Element(warp_id * blockDim.x + lane_id);
  }

  //
  // Efficient epilogue
  //
  cutlass::GemmCoord tb_tile_coord{blockIdx.x, blockIdx.y, 0};

  cutlass::GemmCoord problem_size =
      tb_tile_coord *
      cutlass::GemmCoord{Epilogue::Shape::kM, Epilogue::Shape::kN, 1};

  // Store accumulators
  epilogue(
    problem_size,
    tb_tile_coord,
    accum);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace test
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Instantiates EpilogueWorkspace for a 128x128 threadblock tile with four
/// warps and verifies the storing kernel launches and completes cleanly.
TEST(SM75_gemm_threadblock_epilogue_workspace, tensor_op_128x128_64x64) {
  //
  // Define an instance of the epilogue and see if it works
  //
  static int const kWarpCount = 4;
  static int const kWarpSize = 32;

  using Shape = cutlass::MatrixShape<128, 128>;

  // One accumulator element per thread-owned output element.
  using FragmentC = cutlass::Array<int, Shape::kCount / (kWarpCount * kWarpSize)>;

  using Epilogue = cutlass::gemm::threadblock::EpilogueWorkspace<
    Shape,
    kWarpCount,
    FragmentC
  >;

  typename Epilogue::Params params(
  );

  // Launch the kernel: one CTA of kWarpCount warps.
  dim3 grid(1,1);
  dim3 block(kWarpSize, kWarpCount);

  test::gemm::threadblock::kernel_epilogue_workspace<Epilogue><<< grid, block >>>(
    params
  );

  // Check for launch-configuration errors first, then for asynchronous
  // execution errors surfaced by the synchronize.
  cudaError_t result = cudaGetLastError();
  EXPECT_EQ(result, cudaSuccess) << "Kernel launch error - " << cudaGetErrorString(result);

  result = cudaDeviceSynchronize();
  EXPECT_EQ(result, cudaSuccess) << "Kernel execution error - " << cudaGetErrorString(result);
}
/////////////////////////////////////////////////////////////////////////////////////////////////

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,492 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "mma_pipelined_testbed.h"
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x64x32_64x64x32_8x8x4) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementC = float;
using LayoutC = cutlass::layout::ColumnMajor;
cutlass::gemm::GemmCoord problem_size(64, 64, 128);
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
float alpha = 1.f;
float beta = 0.0f;
// Define the MmaCore components
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementC, LayoutC,
cutlass::arch::OpClassTensorOp>;
dim3 grid(1, 1);
dim3 block(32, 1, 1);
test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
problem_size.k(), alpha, beta)
.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x128x32_64x64x32_8x8x4) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementC = float;
using LayoutC = cutlass::layout::ColumnMajor;
cutlass::gemm::GemmCoord problem_size(128, 128, 128);
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
float alpha = 1.f;
float beta = 0.0f;
// Define the MmaCore components
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementC, LayoutC,
cutlass::arch::OpClassTensorOp>;
dim3 grid(1, 1);
dim3 block(32, 4, 1);
test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
problem_size.k(), alpha, beta)
.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x64x32_32x32x32_8x8x4) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementC = float;
using LayoutC = cutlass::layout::ColumnMajor;
cutlass::gemm::GemmCoord problem_size(64, 64, 128);
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
float alpha = 1.f;
float beta = 0.0f;
// Define the MmaCore components
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementC, LayoutC,
cutlass::arch::OpClassTensorOp>;
dim3 grid(1, 1);
dim3 block(32, 4, 1);
test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
problem_size.k(), alpha, beta)
.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x64x32_64x32x32_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x32 threadblock tiled into 64x32x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x64x64_64x32x64_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x64 threadblock tiled into 64x32x64 warps.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
  // Renamed from OperatorShape for consistency: every sibling test in this
  // file calls the 8x8x4 mma.sync tile InstructionShape.
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x128x32_32x64x32_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 64x128x32 threadblock tiled into 32x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Test name corrected: the warp tile below is 64x64x32 (eight warps within
// the 256x128x32 threadblock, matching block(32, 8, 1)), not 32x64x32 as the
// previous name claimed.
TEST(SM70_gemm_threadblock_congruous, tensor_op_256x128x32_64x64x32_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync).
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(256, 128, 128);

  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // Eight warps: (256/64) x (128/64) = 4 x 2.
  dim3 grid(1, 1);
  dim3 block(32, 8, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x64x32_64x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): threadblock and warp tiles coincide at 64x64x32.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x128x32_64x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_256x128x32_64x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 256x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(256, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of eight warps (4x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 8, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x64x32_32x32x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 64x64x32 threadblock tiled into 32x32x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x64x32_64x32x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x32 threadblock tiled into 64x32x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x64x64_64x32x64_8x8x4) {
  // half_t x half_t -> float, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x64 threadblock tiled into 64x32x64 warps.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
  // Renamed from OperatorShape for consistency: every sibling test in this
  // file calls the 8x8x4 mma.sync tile InstructionShape.
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x128x32_32x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 64x128x32 threadblock tiled into 32x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,334 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
*notice, this list of conditions and the following disclaimer in the
*documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its
*contributors may be used to endorse or promote products derived from this
*software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
*OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit testbed for kernel-level GEMM
*/
#pragma once
#include "../../common/cutlass_unit_test.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/vector.h"
#include "cutlass/numeric_types.h"
#include "cutlass/core_io.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
#include "cutlass/cutlass.h"
#include "cutlass/platform/platform.h"
namespace test {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Device kernel wrapping a threadblock-scoped matrix multiply-accumulate.
///
/// Launch expectations: blockDim.x == 32 (one warp per threadIdx.y slice),
/// so threadIdx.y is the warp index and threadIdx.x the lane index. Each
/// threadblock computes the ThreadblockShape tile of C at tile coordinate
/// (blockIdx.x, blockIdx.y) and stores raw accumulators (no epilogue
/// scaling) through the warp-level IteratorC.
template <typename Mma>
__global__ void kernel_mma(cutlass::gemm::GemmCoord problem_size,
                           typename Mma::IteratorA::Params params_A,
                           typename Mma::IteratorA::TensorRef ref_A,
                           typename Mma::IteratorB::Params params_B,
                           typename Mma::IteratorB::TensorRef ref_B,
                           typename Mma::ElementC *ptr_C, int ldc) {
  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
  __shared__ typename Mma::SharedStorage shared_storage;

  // Compute threadblock location
  cutlass::gemm::GemmCoord tb_tile_offset = {int(blockIdx.x), int(blockIdx.y),
                                             0};

  // Logical offsets (in elements) of this threadblock's A and B tiles.
  cutlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * Mma::Shape::kM,
                                   tb_tile_offset.k()};

  cutlass::MatrixCoord tb_offset_B{tb_tile_offset.k(),
                                   tb_tile_offset.n() * Mma::Shape::kN};

  // Compute position within threadblock
  int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;

  // Construct iterators to A and B operands
  typename Mma::IteratorA iterator_A(params_A, ref_A.data(),
                                     {problem_size.m(), problem_size.k()},
                                     tb_thread_id, tb_offset_A);

  typename Mma::IteratorB iterator_B(params_B, ref_B.data(),
                                     {problem_size.k(), problem_size.n()},
                                     tb_thread_id, tb_offset_B);

  int warp_id = threadIdx.y;
  int lane_id = threadIdx.x;

  // Construct thread-scoped matrix multiply.
  // (Previously passed threadIdx.x directly; lane_id is the same value and
  // makes the fourth argument's role explicit.)
  Mma mma(shared_storage, tb_thread_id, warp_id, lane_id);

  typename Mma::FragmentC accum;
  accum.clear();

  // Ceiling division: number of mainloop iterations over K.
  int gemm_k_iterations =
      (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;

  // Compute threadblock-scoped matrix multiply-add
  mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);

  // Output results; warps are numbered column-major within the threadblock
  // (warp_id % WarpCount::kM selects the row, warp_id / WarpCount::kM the
  // column of the warp grid).
  typename Mma::Operator::IteratorC iterator_C({ptr_C, ldc}, lane_id);

  iterator_C.add_tile_offset(
      {(tb_tile_offset.m() * Mma::WarpCount::kM) +
           (warp_id % Mma::WarpCount::kM),
       (tb_tile_offset.n() * Mma::WarpCount::kN) +
           (warp_id / Mma::WarpCount::kM)});

  iterator_C.store(accum);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product
template <
    /// Threadblock-level matrix multiply-accumulate
    typename MmaCore_,
    /// Number of pipeline stages (1 selects MmaSingleStage, 2 MmaPipelined)
    int Stages = 2>
struct Testbed {

  /// Threadblock-level GEMM implementation
  using MmaCore = MmaCore_;
  using ThreadblockShape = typename MmaCore::Shape;
  using WarpShape = typename MmaCore::WarpShape;
  using InstructionShape = typename MmaCore::InstructionShape;
  using ElementA = typename MmaCore::ElementA;
  using LayoutA = typename MmaCore::LayoutA;
  using ElementB = typename MmaCore::ElementB;
  using LayoutB = typename MmaCore::LayoutB;
  using ElementC = typename MmaCore::ElementC;
  using LayoutC = typename MmaCore::LayoutC;
  static const int kStages = Stages;

  // Define iterators over tiles from the A operand
  // int8 x int8 SIMT GEMMs (idp4a path) use the 2D-thread-tile iterator;
  // all other configurations use the standard predicated tile iterator.
  static const bool use_idp4a = cutlass::platform::is_same<ElementA, int8_t>::value &&
                                cutlass::platform::is_same<ElementB, int8_t>::value &&
                                cutlass::platform::is_same<typename MmaCore::OperatorClass, cutlass::arch::OpClassSimt>::value;

  // Whether the 2D-thread-tile iterators must transpose A / B on load.
  static const bool transposeA = cutlass::platform::is_same< LayoutA, cutlass::layout::ColumnMajor >::value;
  static const bool transposeB = cutlass::platform::is_same< LayoutB, cutlass::layout::RowMajor >::value;

  using IteratorA = typename cutlass::platform::conditional< use_idp4a,
      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA> ,
      cutlass::transform::threadblock::PredicatedTileIterator<
          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA>
      >::type;

  // Define iterators over tiles from the B operand
  using IteratorB = typename cutlass::platform::conditional< use_idp4a,
      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB> ,
      cutlass::transform::threadblock::PredicatedTileIterator<
          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB>
      >::type;

  // Define MmaPipeline Single Stage
  using MmaPipelineSingleStage = cutlass::gemm::threadblock::MmaSingleStage<
      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
      IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
      typename MmaCore::MmaPolicy>;

  // Define MmaPipeline Two Stages
  using MmaPipelineTwoStages = cutlass::gemm::threadblock::MmaPipelined<
      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
      IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
      typename MmaCore::MmaPolicy>;

  // Define the threadblock-scoped pipelined matrix multiply (Select between Single vs. Two stages)
  using Mma = typename cutlass::platform::conditional<(kStages==1), MmaPipelineSingleStage, MmaPipelineTwoStages>::type;

  //
  // Data members
  //

  // Host/device tensors for the operands and the two results.
  cutlass::HostTensor<ElementA, LayoutA> matrix_A;
  cutlass::HostTensor<ElementB, LayoutB> matrix_B;
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed;
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference;

  cutlass::gemm::GemmCoord problem_size;
  float alpha, beta;

  //
  // Methods
  //

  /// Allocates workspace in device memory
  Testbed(int m, int n, int k, float alpha_, float beta_)
      : problem_size(m, n, k), alpha(alpha_), beta(beta_) {
    matrix_A.reset(cutlass::make_Coord(m, k));
    matrix_B.reset(cutlass::make_Coord(k, n));
    matrix_C_computed.reset(cutlass::make_Coord(m, n));
    // Reference result is computed on the host only; no device allocation.
    matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
  }

  /// Runs the test: fills A/B per the requested distributions, launches the
  /// kernel with the given grid/block, then compares against a host
  /// reference GEMM. Returns true on exact match.
  bool run(
      dim3 grid, dim3 block,
      cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
      cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {

    //
    // initialize device memory
    //

    if (init_A == cutlass::Distribution::Uniform) {
      // Narrow the random range for sub-byte element types.
      int scope_max = 8;
      int scope_min = -8;

      if (cutlass::sizeof_bits<ElementA>::value == 4) {
        scope_max = 2;
        scope_min = -2;
      } else if (cutlass::sizeof_bits<ElementA>::value == 1) {
        scope_max = 2;
        scope_min = 0;
      }

      uint64_t seed = 7;
      cutlass::reference::host::TensorFillRandomUniform(
          matrix_A.host_view(), seed, scope_max, scope_min, 0);
    } else if (init_A == cutlass::Distribution::Sequential) {
      cutlass::reference::host::BlockFillSequential(matrix_A.host_data(),
                                                    matrix_A.capacity());
    } else if (init_A == cutlass::Distribution::Identity) {
      cutlass::reference::host::TensorFillIdentity(matrix_A.host_view());
    } else {
      // TODO: Implement the rest
      return false;
    }

    if (init_B == cutlass::Distribution::Uniform) {
      // Same range logic as A.
      int scope_max = 8;
      int scope_min = -8;

      if (cutlass::sizeof_bits<ElementB>::value == 4) {
        scope_max = 2;
        scope_min = -2;
      } else if (cutlass::sizeof_bits<ElementB>::value == 1) {
        scope_max = 2;
        scope_min = 0;
      }

      uint64_t seed = 7;
      // Offset the seed so B's values differ from A's.
      cutlass::reference::host::TensorFillRandomUniform(
          matrix_B.host_view(), seed + 16, scope_max, scope_min, 0);
    } else if (init_B == cutlass::Distribution::Sequential) {
      cutlass::reference::host::BlockFillSequential(matrix_B.host_data(),
                                                    matrix_B.capacity());
    } else if (init_B == cutlass::Distribution::Identity) {
      cutlass::reference::host::TensorFillIdentity(matrix_B.host_view());
    } else {
      // TODO: Implement the rest
      return false;
    }

    // Zero both result tensors so beta scaling starts from a known state.
    cutlass::reference::host::TensorFill(matrix_C_computed.host_view());
    cutlass::reference::host::TensorFill(matrix_C_reference.host_view());

    matrix_A.sync_device();
    matrix_B.sync_device();
    matrix_C_computed.sync_device();

    typename IteratorA::Params params_A(matrix_A.layout());
    typename IteratorB::Params params_B(matrix_B.layout());

    test::gemm::threadblock::kernel_mma<Mma><<<grid, block>>>(
        problem_size, params_A, matrix_A.device_ref(), params_B,
        matrix_B.device_ref(), matrix_C_computed.device_data(),
        matrix_C_computed.layout().stride(0));

    //
    // Check error code
    //

    cudaError_t result = cudaDeviceSynchronize();
    EXPECT_EQ(result, cudaSuccess)
        << " kernel error: " << cudaGetErrorString(result);

    matrix_C_computed.sync_host();

    // Host reference GEMM using the same math operator as the device kernel.
    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
                                   ElementC, LayoutC, ElementC, ElementC,
                                   typename MmaCore::Operator>
        reference_gemm;

    reference_gemm(
        problem_size, ElementC(alpha), matrix_A.host_view(),
        matrix_B.host_view(), ElementC(beta), matrix_C_reference.host_view());

    // Integer-valued inputs make an exact equality comparison valid here.
    bool passed = cutlass::reference::host::TensorEquals(
        matrix_C_computed.host_view(), matrix_C_reference.host_view());

    EXPECT_TRUE(passed);

    // On mismatch, dump all operands and both results for offline diffing.
    if (!passed) {
      std::ofstream output("mma_pipelined_testbed_errors.txt");

      output
          << "A:\n" << matrix_A.host_view() << "\n"
          << "B:\n" << matrix_B.host_view() << "\n"
          << "Reference:\n"
          << matrix_C_reference.host_view() << "\n"
          << "Computed:\n"
          << matrix_C_computed.host_view() << "\n";
    }

    return passed;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace test

View File

@ -0,0 +1,760 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use double-buffered (kStages=2) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[dtype].[threadblock_shape]_[warp_shape]
//////////////// [START] Verifying all layouts {N,T}x{N,T}=>{N,T} for WMMA 16x16x16 [START] //////////////////////
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
////////////////////////////////////////////////////////////
// tests for {N,T}x{N,T}=>{T}
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A row-major x B column-major -> C row-major, with a
  // single 64x64x32 warp tile covering the whole threadblock.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_row_f16, 64x64x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A column-major x B row-major -> C row-major, with a
  // single 64x64x32 warp tile covering the whole threadblock.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_row_f16, 128x128x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A column-major x B row-major -> C row-major,
  // 128x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 64);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_row_f16, 64x64x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A row-major x B row-major -> C row-major, with a
  // single 64x64x32 warp tile covering the whole threadblock.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_row_f16, 128x128x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A row-major x B row-major -> C row-major,
  // 128x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B column-major -> C row-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B column-major -> C row-major,
// larger 128x128 threadblock tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of 4 warps (32x4 = 128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// tests for {N,T}x{N,T}=>{N}
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A row-major x B column-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B row-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A row-major x B row-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B column-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
//////////////// [END] Verifying all layouts {N,T}x{N,T}=>{N,T} for WMMA 16x16x16 [END] //////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A row-major x B column-major -> C row-major,
// 128x128 threadblock tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 64);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of 4 warps (32x4 = 128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Multi-CTA variant: 256x256 problem covered by a 2x2 grid of 128x128 threadblocks.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // 2x2 grid of threadblocks, each of 4 warps (32x4 = 128 threads).
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f16.f16 (wmma native size 32x8x16)
///////////////////////////////////////////////////////////////////////////////
// Exercises the non-square 32x8x16 native WMMA shape (f16 accumulation).
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_32x8x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 runs several mainloop iterations against the K=32 threadblock tile.
  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
//////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f16.f16 (wmma native size 8x32x16)
//////////////////////////////////////////////////////////////////////////////
// Exercises the non-square 8x32x16 native WMMA shape (f16 accumulation).
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_8x32x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 runs several mainloop iterations against the K=32 threadblock tile.
  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
//////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32 (wmma native size 16x16x16)
//////////////////////////////////////////////////////////////////////////////////
// f16 inputs with float (f32) accumulation: A row-major x B column-major -> C row-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// f16 inputs with f32 accumulation, 128x128 threadblock tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of 4 warps (32x4 = 128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Multi-CTA f32-accumulation variant: 256x256 problem over a 2x2 grid of 128x128 threadblocks.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // 2x2 grid of threadblocks, each of 4 warps (32x4 = 128 threads).
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f32.f32 (wmma native size 32x8x16)
////////////////////////////////////////////////////////////
// Exercises the non-square 32x8x16 native WMMA shape with f32 accumulation.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_32x8x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f32.f32 (wmma native size 8x32x16)
/////////////////////////////////////////////////////////////////////////////////
// Exercises the non-square 8x32x16 native WMMA shape with f32 accumulation.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_8x32x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_WMMA_SM70_ENABLED

View File

@ -0,0 +1,331 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use double-buffered (kStages=2) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[atype].[threadblock_shape]_[warp_shape]_[instruction_shape]
/////////////////////////////////////////////////////////////////////////
/// Integer (s8 and u8) WMMA threadblock level tests /////
/////////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED)
// Integer WMMA: s8 inputs with s32 accumulation, A row-major x B column-major -> C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Integer WMMA: s8 inputs with s32 accumulation, deeper K=64 threadblock/warp tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Integer WMMA: s8 inputs with s32 accumulation, A column-major x B row-major -> C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_col_row_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Integer WMMA: s8 inputs with s32 accumulation, column x row layouts, deeper K=64 tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_col_row_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
////////////////////////////////////////////////////////////////////////
/// SUBBYTE (s4 and b1) WMMA threadblock level tests ////
///////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
// Sub-byte WMMA: 4-bit signed integer operands with s32 accumulation -> C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s4, 64x64x128_64x64x128_8x8x32) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // 4-bit integer operands accumulated into 32-bit integers.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Sub-byte WMMA: 4-bit signed integer operands with s32 accumulation -> C column-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_col_s4, 64x64x64_64x64x64_8x8x32) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // 4-bit integer operands accumulated into 32-bit integers.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 64);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Binarized WMMA: 1-bit operands combined with the XOR-popc operator, s32 accumulation,
// C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_b1, 64x64x512_64x64x512_8x8x128) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // 1-bit operands accumulated into 32-bit integers.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma; the trailing OpXorPopc selects the binarized multiply-add.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  cutlass::gemm::GemmCoord problem(64, 64, 2048);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Binarized WMMA: 1-bit operands combined with the XOR-popc operator, s32 accumulation,
// C column-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_col_b1, 64x64x512_64x64x512_8x8x128) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // 1-bit operands accumulated into 32-bit integers.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma; the trailing OpXorPopc selects the binarized multiply-add.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  cutlass::gemm::GemmCoord problem(64, 64, 2048);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
#endif //CUTLASS_ARCH_WMMA_SM75_ENABLED

View File

@ -0,0 +1,411 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use single staged (kStages=1) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_singlestage_wmma_[alayout]_[blayout]_[clayout]_[dtype].[threadblock_shape]_[warp_shape]
///////////////////////////////////////////////////////////////////////////////////////////////////////
/// WMMA Floating point (f16 accumulation) - Single stage - Threadblock level tests ////
///////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f16 accumulation:
  // A row-major x B column-major -> C row-major, wmma native shape 16x16x16.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f16 accumulation:
  // 128x128x32 threadblock tile split into 2x2 warp tiles of 64x64x32.

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 64 -> two mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(128, 128, 64);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of four warps (128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Multi-CTA variant: 2x2 grid of CTAs, each computing a 128x128x32
  // threadblock tile with 2x2 warp tiles of 64x64x32 (f16 accumulation).

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(256, 256, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f16.f16 (wmma native size 32x8x16)
///////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_32x8x16) {
  // Single-stage WMMA threadblock GEMM exercising the 32x8x16 native
  // wmma shape (f16 accumulation, A row-major, B column-major, C row-major).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
//////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f16.f16 (wmma native size 8x32x16)
//////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_8x32x16) {
  // Single-stage WMMA threadblock GEMM exercising the 8x32x16 native
  // wmma shape (f16 accumulation, A row-major, B column-major, C row-major).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
/// WMMA Floating point (f32 accumulation) - Single stage - Threadblock level tests ////
///////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32 (wmma native size 16x16x16)
//////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f32 accumulation:
  // f16 operands, float accumulator, wmma native shape 16x16x16.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 128x128x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f32 accumulation:
  // 128x128x32 threadblock tile split into 2x2 warp tiles of 64x64x32.

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(128, 128, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of four warps (128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Multi-CTA variant with f32 accumulation: 2x2 grid of CTAs, each
  // computing a 128x128x32 threadblock tile with 2x2 warp tiles of 64x64x32.

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(256, 256, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f32.f32 (wmma native size 32x8x16)
////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_32x8x16) {
  // Single-stage WMMA threadblock GEMM exercising the 32x8x16 native
  // wmma shape with f32 accumulation.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f32.f32 (wmma native size 8x32x16)
/////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_8x32x16) {
  // Single-stage WMMA threadblock GEMM exercising the 8x32x16 native
  // wmma shape with f32 accumulation.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_WMMA_SM70_ENABLED

View File

@ -0,0 +1,331 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use single staged (kStages=1) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_singlestage_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[atype].[threadblock_shape]_[warp_shape]_[instruction_shape]
/////////////////////////////////////////////////////////////////////////
/// Integer (s8 and u8) WMMA threadblock level tests ////
/////////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED)
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers:
  // s8 operands, s32 accumulator, A row-major x B column-major -> C row-major.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers with a deeper
  // 64-wide K tile (s8 operands, s32 accumulator, row x col -> row).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> two mainloop iterations over the 64-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_col_row_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers with swapped
  // operand layouts: A column-major x B row-major -> C row-major.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_col_row_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers with swapped
  // operand layouts and a deeper 64-wide K tile.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> two mainloop iterations over the 64-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
////////////////////////////////////////////////////////////////////////
/// SUBBYTE (s4 and b1) WMMA threadblock level tests ////
///////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s4, 64x64x128_64x64x128_8x8x32) {
  // Single-stage WMMA threadblock GEMM on 4-bit integers:
  // s4 operands, s32 accumulator, native wmma shape 8x8x32.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // Problem K matches the 128-wide threadblock K tile: one mainloop iteration.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_col_s4, 64x64x64_64x64x64_8x8x32) {
  // Single-stage WMMA threadblock GEMM on 4-bit integers with a
  // column-major accumulator layout (s4 x s4 -> s32, wmma 8x8x32).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // Problem K matches the 64-wide threadblock K tile: one mainloop iteration.
  cutlass::gemm::GemmCoord problem_size(64, 64, 64);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_b1, 64x64x512_64x64x512_8x8x128) {
  // Single-stage WMMA threadblock GEMM on binarized (1-bit) operands
  // using the XOR-popc operator; s32 accumulator, wmma shape 8x8x128.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core; the trailing OpXorPopc selects the
  // binarized multiply (XOR + population count) operator.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  // K = 2048 -> four mainloop iterations over the 512-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 2048);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_col_b1, 64x64x512_64x64x512_8x8x128) {
  // Single-stage WMMA threadblock GEMM on binarized (1-bit) operands
  // with a column-major accumulator layout (XOR-popc operator).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core; the trailing OpXorPopc selects the
  // binarized multiply (XOR + population count) operator.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  // K = 2048 -> four mainloop iterations over the 512-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 2048);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
#endif //CUTLASS_ARCH_WMMA_SM75_ENABLED