CUTLASS 2.0 (#62)
CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater.
This commit is contained in:
33
test/unit/gemm/threadblock/CMakeLists.txt
Normal file
33
test/unit/gemm/threadblock/CMakeLists.txt
Normal file
@ -0,0 +1,33 @@
|
||||
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
# provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright notice, this list of
|
||||
# conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
# conditions and the following disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without specific prior written
|
||||
# permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_gemm_threadblock
|
||||
mma_pipelined_wmma_sm70.cu
|
||||
mma_pipelined_wmma_sm75.cu
|
||||
mma_singlestage_wmma_sm70.cu
|
||||
mma_singlestage_wmma_sm75.cu
|
||||
mma_pipelined_sm70.cu
|
||||
mma_pipelined_sm75.cu
|
||||
mma_pipelined_simt.cu
|
||||
)
|
||||
|
||||
640
test/unit/gemm/threadblock/batched_gemv.cu
Normal file
640
test/unit/gemm/threadblock/batched_gemv.cu
Normal file
@ -0,0 +1,640 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for threadblock level GEMV
|
||||
*/
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/aligned_buffer.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/gemm/gemm.h"
|
||||
#include "cutlass/layout/matrix.h"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
|
||||
#include "cutlass/core_io.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/gemm.h"
|
||||
|
||||
#include "cutlass/gemm/threadblock/gemv.h"
|
||||
#include "cutlass/gemm/threadblock/default_gemv_core.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace test {
|
||||
namespace gemm {
|
||||
namespace threadblock {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Gemv, typename LongIndex, typename RefA, typename RefB, typename RefC>
|
||||
__global__ void batched_gemv_threadblock_test_kernel(
|
||||
cutlass::gemm::GemmCoord problem_size,
|
||||
LongIndex stride_a,
|
||||
LongIndex stride_b,
|
||||
LongIndex stride_c,
|
||||
RefA ref_A,
|
||||
RefB ref_B,
|
||||
RefC ref_C
|
||||
) {
|
||||
|
||||
typename Gemv::IteratorA::TensorCoord threadblock_offset_A(0, 0);
|
||||
typename Gemv::IteratorB::TensorCoord threadblock_offset_B(0, 0);
|
||||
typename Gemv::IteratorB::TensorCoord threadblock_offset_C(0, 0);
|
||||
|
||||
// Move to the right batches for these threads
|
||||
ref_A.add_pointer_offset(threadIdx.y * stride_a);
|
||||
ref_B.add_pointer_offset(threadIdx.y * stride_b);
|
||||
ref_C.add_pointer_offset(threadIdx.y * stride_c);
|
||||
|
||||
// Construct iterators to A and B operands
|
||||
typename Gemv::IteratorA::Params params_A(ref_A.layout());
|
||||
typename Gemv::IteratorA iterator_A(params_A, ref_A.data(), { problem_size.m(), problem_size.k() }, 0, threadblock_offset_A);
|
||||
typename Gemv::IteratorB::Params params_B(ref_B.layout());
|
||||
typename Gemv::IteratorB iterator_B(params_B, ref_B.data(), { problem_size.k(), problem_size.n() }, threadIdx.x, threadblock_offset_B);
|
||||
|
||||
Gemv gemv;
|
||||
|
||||
typename Gemv::FragmentC accum;
|
||||
accum.clear();
|
||||
|
||||
// Compute threadblock-scoped matrix multiply-add
|
||||
gemv(problem_size, accum, iterator_A, iterator_B, accum);
|
||||
|
||||
// IteratorC is PitchLinear<> assumes n() contiguous
|
||||
typename Gemv::IteratorC::Params params_C(ref_C.layout());
|
||||
typename Gemv::IteratorC iterator_C(params_C, ref_C.data(), { problem_size.m(), problem_size.n() }, threadIdx.x, threadblock_offset_C);
|
||||
iterator_C.store(accum);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename Shape_,
|
||||
typename ElementAB_,
|
||||
typename ElementC_,
|
||||
typename LayoutA_,
|
||||
typename LayoutB_,
|
||||
typename LayoutC_,
|
||||
int LDG_N,
|
||||
int LDG_K,
|
||||
int MAX_THREADS_PER_BLOCK=512,
|
||||
bool DEBUG=false>
|
||||
void batched_gemv_threadblock_test(cutlass::gemm::GemmCoord problem_size, int num_batch)
|
||||
{
|
||||
using Shape = Shape_;
|
||||
using ElementA = ElementAB_;
|
||||
using LayoutA = LayoutA_;
|
||||
using ElementB = ElementAB_;
|
||||
using LayoutB = LayoutB_;
|
||||
using ElementC = ElementC_;
|
||||
using LayoutC = LayoutC_;
|
||||
using ThreadShape = cutlass::gemm::GemmShape<1, LDG_N, LDG_K>;
|
||||
|
||||
using Core = typename cutlass::gemm::threadblock::DefaultGemvCore<
|
||||
Shape,
|
||||
ThreadShape,
|
||||
ElementA,
|
||||
LayoutA,
|
||||
ElementB,
|
||||
LayoutB,
|
||||
ElementC,
|
||||
LayoutC
|
||||
>;
|
||||
|
||||
if (DEBUG)
|
||||
{
|
||||
num_batch = 1;
|
||||
}
|
||||
|
||||
using Mma = cutlass::gemm::threadblock::Gemv<Core>;
|
||||
|
||||
// Create host tensors that will be the backing store for the batches
|
||||
// Note that no device memory is initially allocated
|
||||
cutlass::HostTensor<ElementA, LayoutA> matrix_A({problem_size.m(), problem_size.k()}, false);
|
||||
cutlass::HostTensor<ElementB, LayoutB> matrix_B({problem_size.k(), problem_size.n()}, false);
|
||||
cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed({problem_size.m(), problem_size.n()}, false);
|
||||
cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference({problem_size.m(), problem_size.n()}, false);
|
||||
|
||||
// Reserve memory for the batch of tensors
|
||||
matrix_A.reserve(problem_size.m()*problem_size.k()*num_batch);
|
||||
matrix_B.reserve(problem_size.n()*problem_size.k()*num_batch);
|
||||
matrix_C_computed.reserve(problem_size.m()*problem_size.n()*num_batch);
|
||||
matrix_C_reference.reserve(problem_size.m()*problem_size.n()*num_batch, false);
|
||||
|
||||
// Fill eatch tensor batch
|
||||
const int seed = 6834;
|
||||
for (int b = 0; b < num_batch; b++)
|
||||
{
|
||||
if(DEBUG)
|
||||
{
|
||||
cutlass::reference::host::BlockFillSequential(
|
||||
matrix_A.host_data_ptr_offset(b*matrix_A.capacity()), matrix_A.capacity());
|
||||
cutlass::reference::host::BlockFillSequential(
|
||||
matrix_B.host_data_ptr_offset(b*matrix_B.capacity()), matrix_B.capacity());
|
||||
}
|
||||
else
|
||||
{
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
matrix_A.host_view(b*matrix_A.capacity()),
|
||||
seed + 1660,
|
||||
8,
|
||||
-8,
|
||||
0
|
||||
);
|
||||
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
matrix_B.host_view(b*matrix_B.capacity()),
|
||||
seed + 1880,
|
||||
8,
|
||||
-8,
|
||||
0
|
||||
);
|
||||
}
|
||||
|
||||
cutlass::reference::host::TensorFill(matrix_C_computed.host_view(b*matrix_C_computed.capacity()));
|
||||
cutlass::reference::host::TensorFill(matrix_C_reference.host_view(b*matrix_C_reference.capacity()));
|
||||
}
|
||||
|
||||
matrix_A.sync_device();
|
||||
matrix_B.sync_device();
|
||||
matrix_C_computed.sync_device();
|
||||
|
||||
dim3 grid(1, 1); // only 1 CTA is used
|
||||
dim3 block(Shape::kN / LDG_N, num_batch, 1);
|
||||
|
||||
#if 0
|
||||
printf("block dim = %d x %d\n", block.x, block.y);
|
||||
#endif
|
||||
|
||||
// Some sanity checks
|
||||
EXPECT_TRUE( problem_size.n() % LDG_N == 0 );
|
||||
EXPECT_TRUE( block.x*block.y <= MAX_THREADS_PER_BLOCK );
|
||||
|
||||
test::gemm::threadblock::batched_gemv_threadblock_test_kernel<Mma><<< grid, block >>>(
|
||||
problem_size,
|
||||
matrix_A.capacity(),
|
||||
matrix_B.capacity(),
|
||||
matrix_C_computed.capacity(),
|
||||
matrix_A.device_ref(),
|
||||
matrix_B.device_ref(),
|
||||
matrix_C_computed.device_ref()
|
||||
);
|
||||
|
||||
cudaError_t result = cudaDeviceSynchronize();
|
||||
EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result);
|
||||
|
||||
matrix_C_computed.sync_host();
|
||||
|
||||
// Compute the batched gemms
|
||||
for (int b = 0; b < num_batch; b++)
|
||||
{
|
||||
|
||||
cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
|
||||
ElementC, LayoutC, ElementC, ElementC> reference_gemm;
|
||||
|
||||
reference_gemm(
|
||||
problem_size.mnk(),
|
||||
ElementC(1),
|
||||
matrix_A.host_ref(b*matrix_A.capacity()),
|
||||
matrix_B.host_ref(b*matrix_B.capacity()),
|
||||
ElementC(0),
|
||||
matrix_C_reference.host_ref(b*matrix_C_computed.capacity())
|
||||
);
|
||||
|
||||
bool passed = cutlass::reference::host::TensorEquals(
|
||||
matrix_C_computed.host_view(b*matrix_C_computed.capacity()),
|
||||
matrix_C_reference.host_view(b*matrix_C_reference.capacity()));
|
||||
|
||||
EXPECT_TRUE(passed)
|
||||
//<< "A:\n" << matrix_A.host_view() << "\n"
|
||||
//<< "B:\n" << matrix_B.host_view() << "\n"
|
||||
<< "Batch: " << b << "\n"
|
||||
<< "Reference:\n" << matrix_C_reference.host_view(b*matrix_C_reference.capacity()) << "\n"
|
||||
<< "Computed:\n" << matrix_C_computed.host_view(b*matrix_C_computed.capacity()) << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace threadblock
|
||||
} // namespace gemm
|
||||
} // namespace test
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// A: ColumnMajor
|
||||
// B: RowMajor
|
||||
// C: ColumnMajor
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp32_fp32_2N_2K) {
|
||||
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 2;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape, float, float,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_crc_fp32_fp32_4N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
|
||||
const int num_batch = 5;
|
||||
const int LDG_N = 4;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape, float, float,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_fp32_fp32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
float, float,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp16_fp32_2N_2K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 2;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp16_fp32_2N_8K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 8;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_fp16_fp32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_i8_i32_2N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
int8_t, int32_t,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_i8_i32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
int8_t, int32_t,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
// A: RowMajor
|
||||
// B: ColumnMajor
|
||||
// C: RowMajor
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp32_fp32_2N_2K) {
|
||||
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 2;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape, float, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_rcr_fp32_fp32_4N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
|
||||
const int num_batch = 5;
|
||||
const int LDG_N = 4;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape, float, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_fp32_fp32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
float, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp16_fp32_2N_2K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 2;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp16_fp32_2N_8K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 8;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_fp16_fp32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_i8_i32_2N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
int8_t, int32_t,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_i8_i32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
int8_t, int32_t,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::RowMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
// A: RowMajor
|
||||
// B: ColumnMajor
|
||||
// C: ColumnMajor
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp32_fp32_2N_2K) {
|
||||
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 2;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape, float, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_rcc_fp32_fp32_4N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
|
||||
const int num_batch = 5;
|
||||
const int LDG_N = 4;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape, float, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_fp32_fp32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
float, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp16_fp32_2N_2K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 2;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp16_fp32_2N_8K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 8;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_fp16_fp32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
cutlass::half_t, float,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_i8_i32_2N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
|
||||
const int num_batch = 4;
|
||||
const int LDG_N = 2;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
int8_t, int32_t,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
|
||||
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_i8_i32_1N_4K) {
|
||||
using namespace test::gemm::threadblock;
|
||||
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
|
||||
const int num_batch = 16;
|
||||
const int LDG_N = 1;
|
||||
const int LDG_K = 4;
|
||||
|
||||
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
|
||||
batched_gemv_threadblock_test<Shape,
|
||||
int8_t, int32_t,
|
||||
cutlass::layout::RowMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::layout::ColumnMajor,
|
||||
LDG_N, LDG_K>(problem_size, num_batch);
|
||||
}
|
||||
124
test/unit/gemm/threadblock/epilogue_workspace.cu
Normal file
124
test/unit/gemm/threadblock/epilogue_workspace.cu
Normal file
@ -0,0 +1,124 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for thread-level GEMM
|
||||
*/
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/epilogue/epilogue_workspace.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace test {
|
||||
namespace gemm {
|
||||
namespace threadblock {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel computes accumulator data and stores it out
|
||||
template <typename Epilogue>
|
||||
__global__ void kernel_epilogue_workspace(typename Epilogue::Params params) {
|
||||
|
||||
__shared__ typename Epilogue::SharedStorage shared_storage;
|
||||
|
||||
int warp_id = threadIdx.y;
|
||||
int lane_id = threadIdx.x;
|
||||
|
||||
Epilogue epilogue(params, shared_storage, warp_id, lane_id);
|
||||
|
||||
//
|
||||
// Initialize accumulator tile
|
||||
//
|
||||
typename Epilogue::FragmentC accum;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < Epilogue::FragmentC::kElements; ++i) {
|
||||
accum[i] = Element(warp_id * blockDim.x + lane_id);
|
||||
}
|
||||
|
||||
//
|
||||
// Efficient epilogue
|
||||
//
|
||||
|
||||
cutlass::GemmCoord tb_tile_coord{blockIdx.x, blockIdx.y, 0};
|
||||
|
||||
cutlass::GemmCoord problem_size =
|
||||
tb_tile_coord *
|
||||
cutlass::GemmCoord{Epilogue::Shape::kM, Epilogue::Shape::kN, 1};
|
||||
|
||||
// Store accumulators
|
||||
epilogue(
|
||||
problem_size,
|
||||
tb_tile_coord,
|
||||
accum);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace threadblock
|
||||
} // namespace gemm
|
||||
} // namespace test
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM75_gemm_threadblock_epilogue_workspace, tensor_op_128x128_64x64) {

  //
  // Define an instance of the epilogue and see if it works
  //
  static int const kWarpCount = 4;
  static int const kWarpSize = 32;

  using Shape = cutlass::MatrixShape<128, 128>;

  // Each thread owns an equal share of the 128x128 threadblock tile.
  using FragmentC = cutlass::Array<int, Shape::kCount / (kWarpCount * kWarpSize)>;

  using Epilogue = cutlass::gemm::threadblock::EpilogueWorkspace<
    Shape,
    kWarpCount,
    FragmentC
  >;

  // NOTE(review): the original 'typename Epilogue::Params params( );' is the
  // most vexing parse - it declares a function returning Params rather than
  // default-constructing a Params object. Brace-initialization constructs the
  // object as intended.
  typename Epilogue::Params params{};

  // Launch the kernel: threadIdx.y is the warp index, threadIdx.x the lane.
  dim3 grid(1, 1);
  dim3 block(kWarpSize, kWarpCount);

  test::gemm::threadblock::kernel_epilogue_workspace<Epilogue><<< grid, block >>>(
    params
  );

  cudaError_t result = cudaDeviceSynchronize();

  EXPECT_EQ(result, cudaSuccess) << "Kernel launch error - " << cudaGetErrorString(result);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
1016
test/unit/gemm/threadblock/mma_pipelined_simt.cu
Normal file
1016
test/unit/gemm/threadblock/mma_pipelined_simt.cu
Normal file
File diff suppressed because it is too large
Load Diff
492
test/unit/gemm/threadblock/mma_pipelined_sm70.cu
Normal file
492
test/unit/gemm/threadblock/mma_pipelined_sm70.cu
Normal file
@ -0,0 +1,492 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for thread-level GEMM
|
||||
*/
|
||||
|
||||
#include "mma_pipelined_testbed.h"
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x64x32_64x64x32_8x8x4) {
  //
  // Operand types: congruous arrangement - column-major A, row-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;

  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 64x64x32 threadblock tile computed by a single warp
  // issuing 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing a single warp
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x128x32_64x64x32_8x8x4) {
  //
  // Operand types: congruous arrangement - column-major A, row-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;

  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 128x128x32 threadblock tile partitioned into 64x64x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (128/64) x (128/64)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x64x32_32x32x32_8x8x4) {
  //
  // Operand types: congruous arrangement - column-major A, row-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;

  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 64x64x32 threadblock tile partitioned into 32x32x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (64/32) x (64/32)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x64x32_64x32x32_8x8x4) {
  //
  // Operand types: congruous arrangement - column-major A, row-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;

  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 128x64x32 threadblock tile partitioned into 64x32x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (128/64) x (64/32)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x64x64_64x32x64_8x8x4) {
  // Operands: congruous arrangement - column-major A, row-major B.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  // Tile sizes: a 128x64x64 threadblock tile partitioned into 64x32x64
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
  // Renamed from 'OperatorShape' to 'InstructionShape' for consistency with
  // every other test in this file.
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock containing four warps: (128/64) x (64/32)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x128x32_32x64x32_8x8x4) {
  //
  // Operand types: congruous arrangement - column-major A, row-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;

  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 64x128x32 threadblock tile partitioned into 32x64x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 128, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (64/32) x (128/64)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// NOTE(review): renamed from tensor_op_256x128x32_32x64x32_8x8x4 - the warp
// shape used below is GemmShape<64, 64, 32> (confirmed by block(32, 8, 1):
// (256/64) x (128/64) = 8 warps), so the original test name misdescribed it.
TEST(SM70_gemm_threadblock_congruous, tensor_op_256x128x32_64x64x32_8x8x4) {
  // Operands: congruous arrangement - column-major A, row-major B.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(256, 128, 128);

  // Tile sizes: a 256x128x32 threadblock tile partitioned into 64x64x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock containing eight warps: (256/64) x (128/64)
  dim3 grid(1, 1);
  dim3 block(32, 8, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x64x32_64x64x32_8x8x4) {
  //
  // Operand types: crosswise arrangement - row-major A, column-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;

  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 64x64x32 threadblock tile computed by a single warp
  // issuing 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing a single warp
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x128x32_64x64x32_8x8x4) {
  //
  // Operand types: crosswise arrangement - row-major A, column-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;

  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 128x128x32 threadblock tile partitioned into 64x64x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (128/64) x (128/64)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_256x128x32_64x64x32_8x8x4) {
  //
  // Operand types: crosswise arrangement - row-major A, column-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;

  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 256x128x32 threadblock tile partitioned into 64x64x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(256, 128, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing eight warps: (256/64) x (128/64)
  dim3 grid(1, 1);
  dim3 block(32, 8, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x64x32_32x32x32_8x8x4) {
  //
  // Operand types: crosswise arrangement - row-major A, column-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;

  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 64x64x32 threadblock tile partitioned into 32x32x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (64/32) x (64/32)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x64x32_64x32x32_8x8x4) {
  //
  // Operand types: crosswise arrangement - row-major A, column-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;

  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 128x64x32 threadblock tile partitioned into 64x32x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (128/64) x (64/32)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x64x64_64x32x64_8x8x4) {
  // Operands: crosswise arrangement - row-major A, column-major B.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  // Tile sizes: a 128x64x64 threadblock tile partitioned into 64x32x64
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
  // Renamed from 'OperatorShape' to 'InstructionShape' for consistency with
  // every other test in this file.
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock containing four warps: (128/64) x (64/32)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x128x32_32x64x32_8x8x4) {
  //
  // Operand types: crosswise arrangement - row-major A, column-major B.
  //
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;

  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Tile sizes: a 64x128x32 threadblock tile partitioned into 32x64x32
  // warp tiles of 8x8x4 mma instructions.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 128, 128);

  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock containing four warps: (64/32) x (128/64)
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);

  testbed.run(grid, block);
}
|
||||
|
||||
#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED
|
||||
1788
test/unit/gemm/threadblock/mma_pipelined_sm75.cu
Normal file
1788
test/unit/gemm/threadblock/mma_pipelined_sm75.cu
Normal file
File diff suppressed because it is too large
Load Diff
334
test/unit/gemm/threadblock/mma_pipelined_testbed.h
Normal file
334
test/unit/gemm/threadblock/mma_pipelined_testbed.h
Normal file
@ -0,0 +1,334 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
*modification, are permitted provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
*this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
*notice, this list of conditions and the following disclaimer in the
|
||||
*documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its
|
||||
*contributors may be used to endorse or promote products derived from this
|
||||
*software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
|
||||
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
 *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit testbed for kernel-level GEMM
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/aligned_buffer.h"
|
||||
#include "cutlass/gemm/gemm.h"
|
||||
#include "cutlass/layout/matrix.h"
|
||||
#include "cutlass/layout/vector.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/core_io.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/reference/host/gemm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
|
||||
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
|
||||
#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/platform/platform.h"
|
||||
|
||||
namespace test {
|
||||
namespace gemm {
|
||||
namespace threadblock {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel driving a threadblock-scoped matrix multiply-accumulate (Mma) and
/// storing the accumulator tile to global memory for verification.
template <typename Mma>
__global__ void kernel_mma(cutlass::gemm::GemmCoord problem_size,
                           typename Mma::IteratorA::Params params_A,
                           typename Mma::IteratorA::TensorRef ref_A,
                           typename Mma::IteratorB::Params params_B,
                           typename Mma::IteratorB::TensorRef ref_B,
                           typename Mma::ElementC *ptr_C, int ldc) {
  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
  __shared__ typename Mma::SharedStorage shared_storage;

  // Compute threadblock location
  cutlass::gemm::GemmCoord tb_tile_offset = {int(blockIdx.x), int(blockIdx.y),
                                             0};

  // Element-space offsets of this threadblock's A and B tiles; both begin at
  // k = 0 (tb_tile_offset.k() is always 0 here).
  cutlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * Mma::Shape::kM,
                                   tb_tile_offset.k()};

  cutlass::MatrixCoord tb_offset_B{tb_tile_offset.k(),
                                   tb_tile_offset.n() * Mma::Shape::kN};

  // Compute position within threadblock
  int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;

  // Construct iterators to A and B operands
  typename Mma::IteratorA iterator_A(params_A, ref_A.data(),
                                     {problem_size.m(), problem_size.k()},
                                     tb_thread_id, tb_offset_A);

  typename Mma::IteratorB iterator_B(params_B, ref_B.data(),
                                     {problem_size.k(), problem_size.n()},
                                     tb_thread_id, tb_offset_B);

  // Threads are launched as (32, warp_count): threadIdx.y is the warp index
  // and threadIdx.x the lane within the warp.
  int warp_id = threadIdx.y;
  int lane_id = threadIdx.x;

  // Construct thread-scoped matrix multiply
  // (note: the last argument is threadIdx.x, i.e. the same value as lane_id)
  Mma mma(shared_storage, tb_thread_id, warp_id, threadIdx.x);

  typename Mma::FragmentC accum;

  accum.clear();

  // Number of mainloop iterations needed to cover K, rounded up
  int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;

  // Compute threadblock-scoped matrix multiply-add; 'accum' is passed as both
  // source and destination accumulator.
  mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);

  // Output results
  typename Mma::Operator::IteratorC iterator_C({ptr_C, ldc}, lane_id);

  // Position the output iterator at this warp's tile: warps are arranged
  // column-major within the threadblock (warp_id % kM is the warp row,
  // warp_id / kM the warp column).
  iterator_C.add_tile_offset(
      {(tb_tile_offset.m() * Mma::WarpCount::kM) +
           (warp_id % Mma::WarpCount::kM),
       (tb_tile_offset.n() * Mma::WarpCount::kN) +
           (warp_id / Mma::WarpCount::kM)});

  iterator_C.store(accum);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Structure to compute the matrix product
|
||||
template <
|
||||
/// Threadblock-level matrix multiply-accumulate
|
||||
typename MmaCore_,
|
||||
/// Number of stages
|
||||
int Stages = 2>
|
||||
struct Testbed {
|
||||
/// Threadblock-level GEMM implementation
|
||||
using MmaCore = MmaCore_;
|
||||
using ThreadblockShape = typename MmaCore::Shape;
|
||||
using WarpShape = typename MmaCore::WarpShape;
|
||||
using InstructionShape = typename MmaCore::InstructionShape;
|
||||
using ElementA = typename MmaCore::ElementA;
|
||||
using LayoutA = typename MmaCore::LayoutA;
|
||||
using ElementB = typename MmaCore::ElementB;
|
||||
using LayoutB = typename MmaCore::LayoutB;
|
||||
using ElementC = typename MmaCore::ElementC;
|
||||
using LayoutC = typename MmaCore::LayoutC;
|
||||
static const int kStages = Stages;
|
||||
|
||||
// Define iterators over tiles from the A operand
|
||||
static const bool use_idp4a = cutlass::platform::is_same<ElementA, int8_t>::value &&
|
||||
cutlass::platform::is_same<ElementB, int8_t>::value &&
|
||||
cutlass::platform::is_same<typename MmaCore::OperatorClass, cutlass::arch::OpClassSimt>::value;
|
||||
|
||||
static const bool transposeA = cutlass::platform::is_same< LayoutA, cutlass::layout::ColumnMajor >::value;
|
||||
static const bool transposeB = cutlass::platform::is_same< LayoutB, cutlass::layout::RowMajor >::value;
|
||||
|
||||
using IteratorA = typename cutlass::platform::conditional< use_idp4a,
|
||||
cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
|
||||
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
|
||||
ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA> ,
|
||||
|
||||
cutlass::transform::threadblock::PredicatedTileIterator<
|
||||
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
|
||||
ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA>
|
||||
>::type;
|
||||
|
||||
// Define iterators over tiles from the B operand
|
||||
using IteratorB = typename cutlass::platform::conditional< use_idp4a,
|
||||
cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
|
||||
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
|
||||
ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB> ,
|
||||
|
||||
cutlass::transform::threadblock::PredicatedTileIterator<
|
||||
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
|
||||
ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB>
|
||||
>::type;
|
||||
|
||||
// Define MmaPipeline Single Stage
|
||||
using MmaPipelineSingleStage = cutlass::gemm::threadblock::MmaSingleStage<
|
||||
typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
|
||||
IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
|
||||
typename MmaCore::MmaPolicy>;
|
||||
|
||||
// Define MmaPipeline Two Stages
|
||||
using MmaPipelineTwoStages = cutlass::gemm::threadblock::MmaPipelined<
|
||||
typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
|
||||
IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
|
||||
typename MmaCore::MmaPolicy>;
|
||||
|
||||
// Define the threadblock-scoped pipelined matrix multiply (Select between Single vs. Two stages)
|
||||
using Mma = typename cutlass::platform::conditional<(kStages==1), MmaPipelineSingleStage, MmaPipelineTwoStages>::type;
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
cutlass::HostTensor<ElementA, LayoutA> matrix_A;
|
||||
cutlass::HostTensor<ElementB, LayoutB> matrix_B;
|
||||
cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed;
|
||||
cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference;
|
||||
|
||||
cutlass::gemm::GemmCoord problem_size;
|
||||
float alpha, beta;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Allocates workspace in device memory
|
||||
Testbed(int m, int n, int k, float alpha_, float beta_)
|
||||
: problem_size(m, n, k), alpha(alpha_), beta(beta_) {
|
||||
matrix_A.reset(cutlass::make_Coord(m, k));
|
||||
matrix_B.reset(cutlass::make_Coord(k, n));
|
||||
matrix_C_computed.reset(cutlass::make_Coord(m, n));
|
||||
matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
|
||||
}
|
||||
|
||||
/// Runs the test
|
||||
bool run(
|
||||
dim3 grid, dim3 block,
|
||||
cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
|
||||
cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {
|
||||
//
|
||||
// initialize device memory
|
||||
//
|
||||
|
||||
if (init_A == cutlass::Distribution::Uniform) {
|
||||
|
||||
int scope_max = 8;
|
||||
int scope_min = -8;
|
||||
|
||||
if (cutlass::sizeof_bits<ElementA>::value == 4) {
|
||||
scope_max = 2;
|
||||
scope_min = -2;
|
||||
} else if (cutlass::sizeof_bits<ElementA>::value == 1) {
|
||||
scope_max = 2;
|
||||
scope_min = 0;
|
||||
}
|
||||
|
||||
uint64_t seed = 7;
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
matrix_A.host_view(), seed, scope_max, scope_min, 0);
|
||||
} else if (init_A == cutlass::Distribution::Sequential) {
|
||||
cutlass::reference::host::BlockFillSequential(matrix_A.host_data(),
|
||||
matrix_A.capacity());
|
||||
} else if (init_A == cutlass::Distribution::Identity) {
|
||||
cutlass::reference::host::TensorFillIdentity(matrix_A.host_view());
|
||||
} else {
|
||||
// TODO: Implement the rest
|
||||
return false;
|
||||
}
|
||||
|
||||
if (init_B == cutlass::Distribution::Uniform) {
|
||||
|
||||
int scope_max = 8;
|
||||
int scope_min = -8;
|
||||
|
||||
if (cutlass::sizeof_bits<ElementB>::value == 4) {
|
||||
scope_max = 2;
|
||||
scope_min = -2;
|
||||
} else if (cutlass::sizeof_bits<ElementB>::value == 1) {
|
||||
scope_max = 2;
|
||||
scope_min = 0;
|
||||
}
|
||||
|
||||
uint64_t seed = 7;
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
matrix_B.host_view(), seed + 16, scope_max, scope_min, 0);
|
||||
} else if (init_B == cutlass::Distribution::Sequential) {
|
||||
cutlass::reference::host::BlockFillSequential(matrix_B.host_data(),
|
||||
matrix_B.capacity());
|
||||
} else if (init_B == cutlass::Distribution::Identity) {
|
||||
cutlass::reference::host::TensorFillIdentity(matrix_B.host_view());
|
||||
} else {
|
||||
// TODO: Implement the rest
|
||||
return false;
|
||||
}
|
||||
|
||||
cutlass::reference::host::TensorFill(matrix_C_computed.host_view());
|
||||
|
||||
cutlass::reference::host::TensorFill(matrix_C_reference.host_view());
|
||||
|
||||
matrix_A.sync_device();
|
||||
matrix_B.sync_device();
|
||||
matrix_C_computed.sync_device();
|
||||
|
||||
typename IteratorA::Params params_A(matrix_A.layout());
|
||||
typename IteratorB::Params params_B(matrix_B.layout());
|
||||
|
||||
test::gemm::threadblock::kernel_mma<Mma><<<grid, block>>>(
|
||||
problem_size, params_A, matrix_A.device_ref(), params_B,
|
||||
matrix_B.device_ref(), matrix_C_computed.device_data(),
|
||||
matrix_C_computed.layout().stride(0));
|
||||
|
||||
//
|
||||
// Check error code
|
||||
//
|
||||
|
||||
cudaError_t result = cudaDeviceSynchronize();
|
||||
EXPECT_EQ(result, cudaSuccess)
|
||||
<< " kernel error: " << cudaGetErrorString(result);
|
||||
|
||||
matrix_C_computed.sync_host();
|
||||
|
||||
cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
|
||||
ElementC, LayoutC, ElementC, ElementC,
|
||||
typename MmaCore::Operator>
|
||||
reference_gemm;
|
||||
|
||||
reference_gemm(
|
||||
problem_size, ElementC(alpha), matrix_A.host_view(),
|
||||
matrix_B.host_view(), ElementC(beta), matrix_C_reference.host_view());
|
||||
|
||||
bool passed = cutlass::reference::host::TensorEquals(
|
||||
matrix_C_computed.host_view(), matrix_C_reference.host_view());
|
||||
|
||||
EXPECT_TRUE(passed);
|
||||
|
||||
if (!passed) {
|
||||
std::ofstream output("mma_pipelined_testbed_errors.txt");
|
||||
|
||||
output
|
||||
<< "A:\n" << matrix_A.host_view() << "\n"
|
||||
<< "B:\n" << matrix_B.host_view() << "\n"
|
||||
<< "Reference:\n"
|
||||
<< matrix_C_reference.host_view() << "\n"
|
||||
<< "Computed:\n"
|
||||
<< matrix_C_computed.host_view() << "\n";
|
||||
}
|
||||
|
||||
return passed;
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace threadblock
|
||||
} // namespace gemm
|
||||
} // namespace test
|
||||
760
test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
Normal file
760
test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
Normal file
@ -0,0 +1,760 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for thread-level GEMM
|
||||
*/
|
||||
#include "cutlass/arch/wmma.h"
|
||||
|
||||
#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
|
||||
#include "mma_pipelined_testbed.h"
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
|
||||
|
||||
/// All tests use double-buffered (kStages=2) mma pipeline for the gemm mainloop
|
||||
/// Test name format: SM[arch]_gemm_threadblock_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[dtype].[threadblock_shape]_[warp_shape]
|
||||
|
||||
//////////////// [START] Verifying all layouts {N,T}x{N,T}=>{N,T} for WMMA 16x16x16 [START] //////////////////////
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
// tests for {N,T}x{N,T}=>{T}
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A row-major, B column-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.col.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_row_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A column-major, B row-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_row_f16, 128x128x32_64x64x32_16x16x16) {
  // f16 output; A column-major, B row-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 64);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_row_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A row-major, B row-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_row_f16, 128x128x32_64x64x32_16x16x16) {
  // f16 output; A row-major, B row-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.col.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A column-major, B column-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // f16 output; A column-major, B column-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// tests for {N,T}x{N,T}=>{N}
|
||||
///////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_col_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A row-major, B column-major, C column-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.col.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_col_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A column-major, B row-major, C column-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_col_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A row-major, B row-major, C column-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.col.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_col_f16, 64x64x32_64x64x32_16x16x16) {
  // f16 output; A, B, and C all column-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
//////////////// [END] Verifying all layouts {N,T}x{N,T}=>{N,T} for WMMA 16x16x16 [END] //////////////////////
|
||||
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // f16 output; A row-major, B column-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 64);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Multi-CTA variant: 2x2 grid of 128x128 threadblocks covers 256x256.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(256, 256, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // Four CTAs, four warps each.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m32n8k16.f16.f16 (wmma native size 32x8x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_32x8x16) {
  // Exercises the alternative 32x8x16 native WMMA instruction shape.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m8n32k16.f16.f16 (wmma native size 8x32x16)
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_8x32x16) {
  // Exercises the alternative 8x32x16 native WMMA instruction shape.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32 (wmma native size 16x16x16)
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_16x16x16) {
  // f32 output; A row-major, B column-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with one warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 128x128x32_64x64x32_16x16x16) {
  // f32 output; A row-major, B column-major, C row-major.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / WMMA-instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered mainloop.
  static const int kStages = 2;

  // Threadblock-scoped WMMA mainloop components.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA with four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// Multi-CTA f16 x f16 => f32 WMMA test: 2x2 grid of 128x128x32 CTAs over a 256x256x96 problem.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m32n8k16.f32.f32 (wmma native size 32x8x16)
|
||||
////////////////////////////////////////////////////////////
|
||||
// f16 x f16 => f32 WMMA threadblock GEMM using the 32x8x16 native wmma tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_32x8x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m8n32k16.f32.f32 (wmma native size 8x32x16)
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// f16 x f16 => f32 WMMA threadblock GEMM using the 8x32x16 native wmma tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_8x32x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
#endif //CUTLASS_ARCH_WMMA_SM70_ENABLED
|
||||
331
test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
Normal file
331
test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
Normal file
@ -0,0 +1,331 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for thread-level GEMM
|
||||
*/
|
||||
#include "cutlass/arch/wmma.h"
|
||||
|
||||
#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
|
||||
#include "mma_pipelined_testbed.h"
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
|
||||
|
||||
/// All tests use double-buffered (kStages=2) mma pipeline for the gemm mainloop
|
||||
/// Test name format: SM[arch]_gemm_threadblock_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[atype].[threadblock_shape]_[warp_shape]_[instruction_shape]
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
/// Integer (s8 and u8) WMMA threadblock level tests /////
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED)
|
||||
// s8 x s8 => s32 WMMA threadblock GEMM: 64x64x32 CTA, one warp, 16x16x16 wmma tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// s8 x s8 => s32 WMMA threadblock GEMM with a deeper 64x64x64 CTA/warp K tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
// s8 x s8 => s32 WMMA threadblock GEMM with column-major A / row-major B operands.
TEST(SM75_gemm_threadblock_wmma_tensor_op_col_row_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// s8 x s8 => s32 WMMA threadblock GEMM: column-major A / row-major B, 64x64x64 tiles.
TEST(SM75_gemm_threadblock_wmma_tensor_op_col_row_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
#endif //CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
/// SUBBYTE (s4 and b1) WMMA threadblock level tests ////
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
|
||||
|
||||
// s4 x s4 => s32 sub-byte WMMA threadblock GEMM using the 8x8x32 native wmma tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s4, 64x64x128_64x64x128_8x8x32) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
// s4 x s4 => s32 sub-byte WMMA threadblock GEMM writing a column-major accumulator.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_col_s4, 64x64x64_64x64x64_8x8x32) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 64);
  float const alpha = 1.f;
  float const beta = 0.f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// Binarized (b1) WMMA threadblock GEMM using the XOR-popc operator and 8x8x128 wmma tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_b1, 64x64x512_64x64x512_8x8x128) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA; binary operands multiply-accumulate via XOR + population count.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  cutlass::gemm::GemmCoord problem(64, 64, 2048);
  float const alpha = 1.f;
  float const beta = 0.f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// Binarized (b1) WMMA threadblock GEMM (XOR-popc) writing a column-major accumulator.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_col_b1, 64x64x512_64x64x512_8x8x128) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // Double-buffered (two-stage) mainloop.
  static int const kStages = 2;

  // Threadblock-scoped MMA; binary operands multiply-accumulate via XOR + population count.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  cutlass::gemm::GemmCoord problem(64, 64, 2048);
  float const alpha = 1.f;
  float const beta = 0.f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
#endif //CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
|
||||
|
||||
#endif //CUTLASS_ARCH_WMMA_SM75_ENABLED
|
||||
411
test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
Normal file
411
test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
Normal file
@ -0,0 +1,411 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for thread-level GEMM
|
||||
*/
|
||||
#include "cutlass/arch/wmma.h"
|
||||
|
||||
#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
|
||||
#include "mma_pipelined_testbed.h"
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
|
||||
|
||||
/// All tests use single staged (kStages=1) mma pipeline for the gemm mainloop
|
||||
/// Test name format: SM[arch]_gemm_threadblock_singlestage_wmma_[alayout]_[blayout]_[clayout]_[dtype].[threadblock_shape]_[warp_shape]
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// WMMA Floating point (f16 accumulation) - Single stage - Threadblock level tests ////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Single-stage f16-accumulation WMMA threadblock GEMM: 64x64x32 CTA, one warp.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// Single-stage f16-accumulation WMMA threadblock GEMM: 128x128x32 CTA, four warps.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 64);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
// Multi-CTA single-stage f16 WMMA test: 2x2 grid of 128x128x32 CTAs over 256x256x96.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m32n8k16.f16.f16 (wmma native size 32x8x16)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Single-stage f16 WMMA threadblock GEMM using the 32x8x16 native wmma tile.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_32x8x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m8n32k16.f16.f16 (wmma native size 8x32x16)
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Single-stage f16 WMMA threadblock GEMM using the 8x32x16 native wmma tile.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_8x32x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// WMMA Floating point (f32 accumulation) - Single stage - Threadblock level tests ////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32 (wmma native size 16x16x16)
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// Single-stage f32-accumulation WMMA threadblock GEMM: 64x64x32 CTA, one warp.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// Single-stage f32-accumulation WMMA threadblock GEMM: 128x128x32 CTA, four warps.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 128x128x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 128);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // Single CTA holding four warps.
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
// Multi-CTA single-stage f32 WMMA test: 2x2 grid of 128x128x32 CTAs over 256x256x96.
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Operand and accumulator element types / layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Threadblock / warp / wmma instruction tile sizes.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Single-staged mainloop (no double buffering).
  static int const kStages = 1;

  // Threadblock-scoped MMA assembled from the default WMMA core policy.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB,
      LayoutB, ElementC, LayoutC, cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float const alpha = 1.f;
  float const beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(
      problem.m(), problem.n(), problem.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m32n8k16.f32.f32 (wmma native size 32x8x16)
|
||||
////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_32x8x16) {

  // Tile sizes (M, N, K); exercises the non-square 32x8x16 WMMA native shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand element types and layouts: half-precision A (row-major) and
  // B (column-major), accumulating into row-major f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
|
||||
/// wmma.mma.sync.aligned.row.col.m8n32k16.f32.f32 (wmma native size 8x32x16)
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_8x32x16) {

  // Tile sizes (M, N, K); exercises the non-square 8x32x16 WMMA native shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand element types and layouts: half-precision A (row-major) and
  // B (column-major), accumulating into row-major f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
#endif //CUTLASS_ARCH_WMMA_SM70_ENABLED
|
||||
331
test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
Normal file
331
test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
Normal file
@ -0,0 +1,331 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Unit tests for thread-level GEMM
|
||||
*/
|
||||
#include "cutlass/arch/wmma.h"
|
||||
|
||||
#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
|
||||
#include "mma_pipelined_testbed.h"
|
||||
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
|
||||
|
||||
/// All tests use single staged (kStages=1) mma pipeline for the gemm mainloop
|
||||
/// Test name format: SM[arch]_gemm_threadblock_singlestage_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[atype].[threadblock_shape]_[warp_shape]_[instruction_shape]
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
/// Integer (s8 and u8) WMMA threadblock level tests ////
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED)
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s8, 64x64x32_64x64x32_16x16x16) {

  // Tile sizes (M, N, K): threadblock, warp, and WMMA instruction.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Signed 8-bit operands (A row-major, B column-major) accumulating into
  // row-major s32.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s8, 64x64x64_64x64x64_16x16x16) {

  // Tile sizes (M, N, K): deeper K-dimension (64) than the sibling 64x64x32 test.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Signed 8-bit operands (A row-major, B column-major) accumulating into
  // row-major s32.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_col_row_row_s8, 64x64x32_64x64x32_16x16x16) {

  // Tile sizes (M, N, K): threadblock, warp, and WMMA instruction.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Signed 8-bit operands with transposed layouts relative to the row_col
  // tests: A column-major, B row-major, accumulating into row-major s32.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_col_row_row_s8, 64x64x64_64x64x64_16x16x16) {

  // Tile sizes (M, N, K): deeper K-dimension (64) than the sibling 64x64x32 test.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Signed 8-bit operands with transposed layouts relative to the row_col
  // tests: A column-major, B row-major, accumulating into row-major s32.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  float alpha = 1.f;
  float beta = 0.0f;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
#endif //CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
/// SUBBYTE (s4 and b1) WMMA threadblock level tests ////
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s4, 64x64x128_64x64x128_8x8x32) {
  // Signed 4-bit operands (A row-major, B column-major) accumulating into
  // row-major s32, using the 8x8x32 sub-byte WMMA instruction shape.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);

  // Tile sizes (M, N, K): threadblock, warp, and WMMA instruction.
  // NOTE: alias renamed from ThreadBlockShape to ThreadblockShape for
  // consistency with the other tests in this file.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  float alpha = 1.f;
  float beta = 0.f;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_col_s4, 64x64x64_64x64x64_8x8x32) {
  // Signed 4-bit operands (A row-major, B column-major) accumulating into
  // COLUMN-major s32, using the 8x8x32 sub-byte WMMA instruction shape.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  cutlass::gemm::GemmCoord problem_size(64, 64, 64);

  // Tile sizes (M, N, K): threadblock, warp, and WMMA instruction.
  // NOTE: alias renamed from ThreadBlockShape to ThreadblockShape for
  // consistency with the other tests in this file.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  float alpha = 1.f;
  float beta = 0.f;

  // Threadblock-scoped matrix multiply components assembled from the
  // shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_b1, 64x64x512_64x64x512_8x8x128) {
  // Binarized (1-bit) operands (A row-major, B column-major) accumulating
  // into row-major s32; the mainloop uses the XOR-popcount operator.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  cutlass::gemm::GemmCoord problem_size(64, 64, 2048);

  // Tile sizes (M, N, K): threadblock, warp, and WMMA instruction
  // (8x8x128 binarized WMMA native shape).
  // NOTE: alias renamed from ThreadBlockShape to ThreadblockShape for
  // consistency with the other tests in this file.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  float alpha = 1.f;
  float beta = 0.f;

  // Threadblock-scoped matrix multiply components; OpXorPopc selects the
  // XOR + population-count multiply-accumulate.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
|
||||
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_col_b1, 64x64x512_64x64x512_8x8x128) {
  // Binarized (1-bit) operands (A row-major, B column-major) accumulating
  // into COLUMN-major s32; the mainloop uses the XOR-popcount operator.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Single-staged mainloop.
  static const int kStages = 1;

  cutlass::gemm::GemmCoord problem_size(64, 64, 2048);

  // Tile sizes (M, N, K): threadblock, warp, and WMMA instruction
  // (8x8x128 binarized WMMA native shape).
  // NOTE: alias renamed from ThreadBlockShape to ThreadblockShape for
  // consistency with the other tests in this file.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  float alpha = 1.f;
  float beta = 0.f;

  // Threadblock-scoped matrix multiply components; OpXorPopc selects the
  // XOR + population-count multiply-accumulate.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  // Single CTA, single warp (threadblock tile == warp tile).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
|
||||
#endif //CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
|
||||
|
||||
#endif //CUTLASS_ARCH_WMMA_SM75_ENABLED
|
||||
Reference in New Issue
Block a user