CUTLASS 2.0 (#62)
CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater.
This commit is contained in:
228
include/cutlass/reduction/thread/reduce.h
Normal file
228
include/cutlass/reduction/thread/reduce.h
Normal file
@ -0,0 +1,228 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Defines basic thread level reduction with specializations for Array<T, N>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/half.h"
|
||||
#include "cutlass/functional.h"
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace thread {
|
||||
|
||||
/// Structure to compute the thread level reduction.
/// Primary template is declared but not defined; the partial specializations
/// below supply implementations for supported (operator, operand) pairings.
template <typename Op, typename T>
struct Reduce;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial Specialization of Reduce for "plus" (a functional operator)
|
||||
template <typename T>
|
||||
struct Reduce< plus<T>, T > {
|
||||
|
||||
CUTLASS_HOST_DEVICE
|
||||
T operator()(T lhs, T const &rhs) const {
|
||||
plus<T> _op;
|
||||
return _op(lhs, rhs);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specialization of Reduce for Array<T, N>
|
||||
template <typename T, int N>
|
||||
struct Reduce < plus<T>, Array<T, N>> {
|
||||
|
||||
CUTLASS_HOST_DEVICE
|
||||
Array<T, 1> operator()(Array<T, N> const &in) const {
|
||||
|
||||
Array<T, 1> result;
|
||||
Reduce< plus<T>, T > scalar_reduce;
|
||||
result.clear();
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
result[0] = scalar_reduce(result[0], in[i]);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specializations of Reduce for Array<half_t, N>
|
||||
template <int N>
|
||||
struct Reduce < plus<half_t>, Array<half_t, N> > {
|
||||
|
||||
CUTLASS_HOST_DEVICE
|
||||
Array<half_t, 1> operator()(Array<half_t, N> const &input) {
|
||||
|
||||
Array<half_t, 1> result;
|
||||
|
||||
// If there is only 1 element - there is nothing to reduce
|
||||
if( N ==1 ){
|
||||
|
||||
result[0] = input.front();
|
||||
|
||||
} else {
|
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
|
||||
|
||||
__half result_d;
|
||||
Array<half_t, 1> const *in_ptr_half = reinterpret_cast<Array<half_t, 1> const *>(&input);
|
||||
Array<half_t, 2> const *in_ptr_half2 = reinterpret_cast<Array<half_t, 2> const *>(&input);
|
||||
__half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
|
||||
|
||||
// Set initial result = first half2, in case N==2
|
||||
__half2 tmp_result = x_in_half2[0];
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 1; i < N/2; ++i) {
|
||||
|
||||
tmp_result = __hadd2(x_in_half2[i], tmp_result);
|
||||
|
||||
}
|
||||
|
||||
result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
|
||||
|
||||
// One final step is needed for odd "N" (to add the (N-1)th element)
|
||||
if( N%2 ){
|
||||
|
||||
__half last_element;
|
||||
Array<half_t, 1> tmp_last;
|
||||
Array<half_t, 1> *tmp_last_ptr = &tmp_last;
|
||||
tmp_last_ptr[0] = in_ptr_half[N-1];
|
||||
last_element = reinterpret_cast<__half const &>(tmp_last);
|
||||
|
||||
result_d = __hadd(result_d, last_element);
|
||||
|
||||
}
|
||||
|
||||
Array<half_t, 1> *result_ptr = &result;
|
||||
*result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
|
||||
|
||||
#else
|
||||
|
||||
Reduce< plus<half_t>, half_t > scalar_reduce;
|
||||
result.clear();
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
|
||||
result[0] = scalar_reduce(result[0], input[i]);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specializations of Reduce for AlignedArray<half_t, N>
|
||||
template <int N>
|
||||
struct Reduce < plus<half_t>, AlignedArray<half_t, N> > {
|
||||
|
||||
CUTLASS_HOST_DEVICE
|
||||
Array<half_t, 1> operator()(AlignedArray<half_t, N> const &input) {
|
||||
|
||||
Array<half_t, 1> result;
|
||||
|
||||
// If there is only 1 element - there is nothing to reduce
|
||||
if( N ==1 ){
|
||||
|
||||
result[0] = input.front();
|
||||
|
||||
} else {
|
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
|
||||
|
||||
__half result_d;
|
||||
AlignedArray<half_t, 1> const *in_ptr_half = reinterpret_cast<AlignedArray<half_t, 1> const *>(&input);
|
||||
AlignedArray<half_t, 2> const *in_ptr_half2 = reinterpret_cast<AlignedArray<half_t, 2> const *>(&input);
|
||||
__half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
|
||||
|
||||
// Set initial result = first half2, in case N==2
|
||||
__half2 tmp_result = x_in_half2[0];
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 1; i < N/2; ++i) {
|
||||
|
||||
tmp_result = __hadd2(x_in_half2[i], tmp_result);
|
||||
|
||||
}
|
||||
|
||||
result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
|
||||
|
||||
// One final step is needed for odd "N" (to add the (N-1)th element)
|
||||
if( N%2 ){
|
||||
|
||||
__half last_element;
|
||||
AlignedArray<half_t, 1> tmp_last;
|
||||
AlignedArray<half_t, 1> *tmp_last_ptr = &tmp_last;
|
||||
tmp_last_ptr[0] = in_ptr_half[N-1];
|
||||
last_element = reinterpret_cast<__half const &>(tmp_last);
|
||||
|
||||
result_d = __hadd(result_d, last_element);
|
||||
|
||||
}
|
||||
|
||||
Array<half_t, 1> *result_ptr = &result;
|
||||
*result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
|
||||
|
||||
#else
|
||||
|
||||
Reduce< plus<half_t>, half_t > scalar_reduce;
|
||||
result.clear();
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
|
||||
result[0] = scalar_reduce(result[0], input[i]);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
96
include/cutlass/reduction/thread/reduction_operators.h
Normal file
96
include/cutlass/reduction/thread/reduction_operators.h
Normal file
@ -0,0 +1,96 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
    \brief Defines thread-level reduction operators used by device-wide reduction kernels
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace thread {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Mixed-precision reduction
|
||||
template <
|
||||
typename ElementAccumulator_,
|
||||
typename Element_,
|
||||
int Count = 1
|
||||
>
|
||||
struct ReduceAdd {
|
||||
|
||||
//
|
||||
// Type definitions
|
||||
//
|
||||
|
||||
using ElementAccumulator = ElementAccumulator_;
|
||||
using Element = Element_;
|
||||
static int const kCount = Count;
|
||||
|
||||
using FragmentAccumulator = cutlass::Array<ElementAccumulator, kCount>;
|
||||
using FragmentElement = cutlass::Array<Element, kCount>;
|
||||
|
||||
struct Params { };
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Parameters object
|
||||
Params params;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Constructor
|
||||
CUTLASS_HOST_DEVICE
|
||||
ReduceAdd(Params params_ = Params()): params(params_) { }
|
||||
|
||||
/// Operator
|
||||
CUTLASS_HOST_DEVICE
|
||||
FragmentAccumulator operator()(
|
||||
FragmentAccumulator accumulator,
|
||||
FragmentElement element) const {
|
||||
|
||||
plus<FragmentAccumulator> op;
|
||||
|
||||
return op(accumulator, element);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace thread
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
Reference in New Issue
Block a user