Checkpointing CUTLASS 1.1 release.

This commit is contained in:
akerr
2018-09-18 16:58:03 -07:00
parent cf0301e00f
commit 461f417b9d
193 changed files with 29496 additions and 4771 deletions

View File

@@ -0,0 +1,162 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <curand_kernel.h>
#include "cutlass/cutlass.h"
namespace cutlass {
namespace reference {
namespace device {
namespace kernel {
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Kernel to initialize tensor to a uniform random distribution
template <typename T>
__global__ void TensorInitializeUniform(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
__shared__ curandState_t rng_state[1024];
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
double range = dist.uniform.max - dist.uniform.min;
double rnd = curand_uniform(&rng_state[threadIdx.x]);
rnd = dist.uniform.min + range * rnd;
// Random values are cast to integer after scaling by a power of two to facilitate error
// testing
if (dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << dist.int_scale)));
*tensor = T(rnd / double(1 << dist.int_scale));
} else {
*tensor = T(rnd);
}
tensor += ldm;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Kernel to initialize tensor to a Gaussian distribution
template <typename T>
__global__ void TensorInitializeGaussian(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
__shared__ curandState_t rng_state[1024];
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
// Random values are cast to integer after scaling by a power of two to facilitate error
// testing
double rnd = curand_normal(&rng_state[threadIdx.x]);
rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;
if (dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << dist.int_scale)));
*tensor = T(rnd / double(1 << dist.int_scale));
} else {
*tensor = T(rnd);
}
tensor += ldm;  // advance along the strided dimension (as in TensorInitializeUniform)
}
}
}
/// Kernel to initialize tensor to a linear combination of its indices
template <typename T>
__global__ void TensorInitializeLinear(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
__shared__ curandState_t rng_state[1024];
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
*tensor =
dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;
tensor += ldm;  // advance along the strided dimension
}
}
}
/// Kernel to initialize tensor to an identity matrix
template <typename T>
__global__ void TensorInitializeIdentity(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
__shared__ curandState_t rng_state[1024];
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
*tensor = (c_idx == s_idx ? T(1) : T(0));
tensor += ldm;  // advance along the strided dimension
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace device
} // namespace reference
} // namespace cutlass
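For context, here is a minimal launch sketch for TensorInitializeUniform above (editorial, not part of this commit). The grid and block shapes follow the kernel's indexing convention: blockDim.x threads per block, with blockIdx.y covering tiles of blockDim.x elements of the strided dimension. Distribution::set_uniform() is assumed from tools/util/distribution.h.

// Hypothetical launch sketch; example_launch_uniform is not part of this commit.
void example_launch_uniform(float *device_ptr, int M, int N, int ldm, int64_t seed) {
  cutlass::Distribution dist;
  dist.set_uniform(-4, 4);                   // assumed Distribution helper
  int const kThreads = 256;                  // must not exceed the 1024-entry rng_state array
  dim3 block(kThreads, 1, 1);
  dim3 grid(
    (M + kThreads - 1) / kThreads,           // contiguous dimension
    (N + kThreads - 1) / kThreads,           // strided dimension, one tile of kThreads per block
    1);
  cutlass::reference::device::kernel::TensorInitializeUniform<float><<< grid, block >>>(
    dist, seed, M, N, device_ptr, ldm);
}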

View File

@@ -0,0 +1,112 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
namespace cutlass {
namespace reference {
namespace device {
namespace kernel {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines several helpers
namespace detail {
/// Helper to perform for-each operation
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {
/// Constructor for general rank
__inline__ __device__
TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
int64_t product = 1;
CUTLASS_PRAGMA_UNROLL
for (int i = Rank - RankRemaining; i < Rank; ++i) {
product *= size[i];
}
coord[Rank - 1 - RankRemaining] = index / product;
int64_t remaining = index % product;
TensorForEachHelper<Func, Rank, RankRemaining-1>(func, size, coord, remaining);
}
};
/// Helper to perform for-each operation
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {
/// Constructor for fastest-changing rank
__inline__ __device__
TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
coord[Rank - 1] = index;
if (coord < size) {
func(coord);
}
}
};
} // namespace detail
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to perform for-each operation
template <typename Func, int Rank, typename Params>
__global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
Func func(params);
int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
int64_t max_index = 1;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < Rank; ++i) {
max_index *= size[i];
}
CUTLASS_PRAGMA_NO_UNROLL
while (index < max_index) {
Coord<Rank> coord;
detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index);
index += blockDim.x * gridDim.x;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace device
} // namespace reference
} // namespace cutlass
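The device-side helper above peels one rank per level of recursion: each level divides the linear index by the product of the extents of the faster-varying ranks that remain. A standalone host-side sketch of the same arithmetic for a rank-3 tensor (editorial, not part of this commit; decompose_rank3 is a hypothetical name):

#include <cassert>
#include <cstdint>

// Decomposes a linear index into a rank-3 coordinate, slowest rank first,
// mirroring what TensorForEachHelper does one recursion level at a time.
void decompose_rank3(int64_t index, int const size[3], int coord[3]) {
  // Level RankRemaining = 2: strip off the slowest-varying rank.
  int64_t product = int64_t(size[1]) * size[2];
  coord[0] = int(index / product);
  index %= product;
  // Level RankRemaining = 1.
  coord[1] = int(index / size[2]);
  index %= size[2];
  // Level RankRemaining = 0: fastest-changing rank.
  coord[2] = int(index);
}

int main() {
  int size[3] = {4, 5, 6};
  int coord[3];
  decompose_rank3(53, size, coord);   // 53 == 1*(5*6) + 3*6 + 5
  assert(coord[0] == 1 && coord[1] == 3 && coord[2] == 5);
  return 0;
}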

View File

@@ -0,0 +1,772 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Defines device-side elementwise operations on TensorView. Note, the operations defined
in this header are not specialized for any particular data layout and are therefore not
intended to offer the best possible performance. Rather, they are intended to be generic
reference implementations to support the CUTLASS unit tests.
*/
#pragma once
// Standard Library includes
#include <fstream>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>
// CUDA includes
#include <cublas_v2.h>
#include <curand_kernel.h>
// Cutlass includes
#include "cutlass/cutlass.h"
#include "tools/util/device_memory.h"
#include "tools/util/distribution.h"
#include "tools/util/type_traits.h"
#include "tools/util/host_tensor.h"
#include "tools/util/reference/device/tensor_foreach.h"
namespace cutlass {
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace reference {
namespace device {
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
/// Computes a random uniform distribution
template <typename View_>
struct RandomUniformFunc {
/// View type
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
/// Parameters structure
struct Params {
/// View object
View view;
/// RNG seed
int64_t seed;
/// Distribution
Distribution dist;
/// Default ctor
CUTLASS_HOST_DEVICE
Params() { }
/// Constructor
CUTLASS_HOST_DEVICE
Params(
View const &view,
int64_t seed,
Distribution dist
): view(view), seed(seed), dist(dist) { }
};
//
// Data members
//
/// Parameters object
Params params;
/// RNG state object
curandState_t rng_state;
//
// Methods
//
/// Device-side initialization of RNG
CUTLASS_DEVICE
RandomUniformFunc(Params const &params): params(params) {
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(params.seed, gtid, 0, &rng_state);
}
/// Compute random value and update RNG state
CUTLASS_DEVICE
void operator()(TensorCoord const &coord) {
double range = params.dist.uniform.max - params.dist.uniform.min;
double rnd = curand_uniform(&rng_state);
rnd = params.dist.uniform.min + range * rnd;
// Random values are cast to integer after scaling by a power of two to facilitate error
// testing
T result;
if (params.dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
result = T(rnd / double(1 << params.dist.int_scale));
}
else {
result = T(rnd);
}
params.view.at(coord) = result;
}
};
/// Computes a random Gaussian distribution
template <typename View_>
struct RandomGaussianFunc {
/// View type
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
/// Parameters structure
struct Params {
/// View object
View view;
/// RNG seed
int64_t seed;
/// RNG distribution
Distribution dist;
/// Default ctor
CUTLASS_HOST_DEVICE
Params() { }
/// Constructor
CUTLASS_HOST_DEVICE
Params(
View const &view,
int64_t seed,
Distribution dist
): view(view), seed(seed), dist(dist) { }
};
//
// Data members
//
/// Parameters object
Params params;
/// RNG state object
curandState_t rng_state;
//
// Methods
//
/// Device-side initialization of RNG
CUTLASS_DEVICE
RandomGaussianFunc(Params const &params): params(params) {
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(params.seed, gtid, 0, &rng_state);
}
/// Compute random value and update RNG state
CUTLASS_DEVICE
void operator()(TensorCoord const &coord) {
double rnd = curand_normal(&rng_state);
rnd = params.dist.gaussian.mean + params.dist.gaussian.stddev * rnd;
T result;
if (params.dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
result = T(rnd / double(1 << params.dist.int_scale));
}
else {
result = T(rnd);
}
params.view.at(coord) = result;
}
};
/// Computes a linear combination of each element
template <typename View_>
struct LinearCombinationFunc {
/// View type
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
//
// Data members
//
/// TensorView object
View view;
/// Delta
Coord<View::kRank, double> delta;
/// Offset
double offset;
//
// Methods
//
/// Constructor
CUTLASS_HOST_DEVICE
LinearCombinationFunc(
View const &view,
Distribution dist
): view(view) {
offset = dist.linear.offset;
if (View::kRank >= 1) {
delta[View::kRank - 1] = dist.linear.delta_column;
}
if (View::kRank >= 2) {
delta[View::kRank - 2] = dist.linear.delta_row;
}
// Additional ranks have delta of zero
for (int i = View::kRank - 2; i > 0; --i) {
delta[i - 1] = 0;
}
}
/// Compute linear combination
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
double result = offset;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < View::kRank; ++i) {
result += delta[i] * double(coord[i]);
}
view.at(coord) = T(result);
}
};
/// Functor writing 1 if the coordinate lies on the tensor's diagonal, 0 otherwise
template <typename View_>
struct IdentityFunc {
/// TensorView
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
//
// Data members
//
/// View object
View view;
/// Default ctor
CUTLASS_HOST_DEVICE
IdentityFunc(View const &view): view(view) { }
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
bool equal = true;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < View::kRank; ++i) {
if (coord[i] != coord[0]) {
equal = false;
}
}
view.at(coord) = equal ? T(1) : T(0);
}
};
} // namespace detail
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Initializes a tensor randomly or procedurally.
template <typename View>
void TensorInitialize(View const &view,
int64_t seed,
Distribution const &dist) {
typedef typename View::Storage Scalar;
switch (dist.kind) {
case Distribution::Uniform:
{
typedef detail::RandomUniformFunc<View> Func;
typedef typename Func::Params Params;
TensorForEach<Func, View::kRank, Params>(
view.size(),
Params(view, seed, dist)
);
}
break;
case Distribution::Gaussian:
{
typedef detail::RandomGaussianFunc<View> Func;
typedef typename Func::Params Params;
TensorForEach<Func, View::kRank, Params>(
view.size(),
Params(view, seed, dist)
);
}
break;
case Distribution::Linear:
{
typedef detail::LinearCombinationFunc<View> Func;
TensorForEach<Func, View::kRank, Func>(
view.size(),
Func(view, dist));
}
break;
case Distribution::Identity:
{
typedef detail::IdentityFunc<View> Func;
Func func(view);
TensorForEach<Func, View::kRank, Func>(view.size(), func);
}
break;
default:
break;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace reference
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dispatcher to appropriate initialization kernel - preserved for backwards compatibility
template <typename T>
inline void tensor_initialize(Distribution const &dist,
int64_t seed,
int dim_contiguous,
int dim_strided,
T *tensor,
int ldm) {
TensorView<T, 2> view(tensor, make_Coord(ldm, 1), make_Coord(dim_strided, dim_contiguous));
reference::device::TensorInitialize(view, seed, dist);
}
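// Usage sketch (editorial, not part of this commit): fill a column-major
// M x N device allocation with Gaussian noise via the dispatcher above.
// Distribution::set_gaussian() is assumed from tools/util/distribution.h;
// example_tensor_initialize is a hypothetical name.
inline void example_tensor_initialize(float *device_ptr, int M, int N, int ldm) {
  Distribution dist;
  dist.set_gaussian(0.0, 2.0);   // mean 0.0, standard deviation 2.0
  tensor_initialize(dist, 2018 /*seed*/, M, N, device_ptr, ldm);
}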
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace reference {
namespace device {
namespace detail {
/// Compares two tensor views of equal rank and dimension.
template <typename ViewL, typename ViewR>
struct TensorEqualsFunc {
/// Storage type
typedef typename ViewL::Storage T;
/// Unsigned integer type of same size as View type
typedef typename cutlass::TypeTraits<T>::unsigned_type UnsignedType;
/// Coordinate in tensor's index space
typedef typename ViewL::TensorCoord TensorCoord;
/// Assertions
static_assert(ViewL::kRank == ViewR::kRank,
"Cannot compare tensors of different rank");
//
// Data members
//
/// View of left-hand-side tensor
ViewL lhs;
/// View of right-hand-side tensor
ViewR rhs;
/// Pointer to result scalar - only written with 0 if values are incorrect
int *result;
//
// Methods
//
/// Constructor
CUTLASS_HOST_DEVICE
TensorEqualsFunc(ViewL const &lhs, ViewR const &rhs, int *result): lhs(lhs), rhs(rhs), result(result) { }
/// Equality check
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
UnsignedType _lhs = reinterpret_cast<UnsignedType const &>(lhs.at(coord));
UnsignedType _rhs = reinterpret_cast<UnsignedType const &>(rhs.at(coord));
if (_lhs != _rhs) {
*result = 0;
}
}
};
} // namespace detail
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Returns true if two tensor views are equal.
template <typename ViewL, typename ViewR>
bool TensorEquals(ViewL const &lhs, ViewR const &rhs) {
// Sizes must be identical
if (lhs.size() != rhs.size()) {
return false;
}
// Allocate device memory to contain result of kernel reduction
HostTensor<int, 1> result(1);
result.fill(1);
result.sync_device();
typedef detail::TensorEqualsFunc<ViewL, ViewR> Func;
Func func(lhs, rhs, result.device_data());
TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);
result.sync_host();
return result.at(0) != 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to apply a binary operator in place
template <typename ViewL, typename ViewR, typename BinaryFunc>
struct TensorFuncBinaryOp {
/// Coordinate in tensor's index space
typedef typename ViewL::TensorCoord TensorCoord;
//
// Data members
//
/// View of left-hand-side tensor
ViewL lhs;
/// View of right-hand-side tensor
ViewR rhs;
/// Binary function applied to each element
BinaryFunc func;
//
// Methods
//
/// Constructor
CUTLASS_HOST_DEVICE
TensorFuncBinaryOp(
ViewL const &lhs,
ViewR const &rhs,
BinaryFunc func = BinaryFunc()): lhs(lhs), rhs(rhs), func(func) { }
/// Applies the binary function and stores the result in the left-hand tensor
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
lhs.at(coord) = func(lhs.at(coord), rhs.at(coord));
}
};
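// Usage sketch (editorial, not part of this commit): element-wise accumulation
// lhs += rhs using the helper above. ExamplePlus is a hypothetical functor;
// any device-callable T(T, T) works.
struct ExamplePlus {
  CUTLASS_HOST_DEVICE
  float operator()(float a, float b) const { return a + b; }
};
// Applied over the left-hand tensor's index space:
//   typedef TensorFuncBinaryOp<ViewL, ViewR, ExamplePlus> Func;
//   Func func(lhs, rhs);
//   TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);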
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
/// Helper to fill a tensor from the elements of another tensor
template <typename ViewL, typename ViewR>
struct TensorFillFunc {
/// Coordinate in tensor's index space
typedef typename ViewL::TensorCoord TensorCoord;
/// Destination element type
typedef typename ViewL::Storage DestType;
/// Source element type
typedef typename ViewR::Storage SrcType;
/// Parameters object
struct Params {
//
// Data members
//
/// View of left-hand-side tensor
ViewL lhs;
/// View of right-hand-side tensor
ViewR rhs;
/// Source offset coordinate
TensorCoord source_offset;
/// Size of the subtensor copied from the source
TensorCoord source_size;
/// Offset in destination
TensorCoord dest_offset;
//
// Methods
//
/// Constructs a parameters object for filling a tensor
Params(
ViewL const &lhs,
ViewR const &rhs,
TensorCoord const &source_offset = TensorCoord()
):
lhs(lhs), rhs(rhs), source_offset(source_offset), source_size(rhs.size() - source_offset) { }
/// Constructs a parameters object for filling a tensor
Params(
ViewL const &lhs,
ViewR const &rhs,
TensorCoord const &source_offset,
TensorCoord const &source_size,
TensorCoord const &dest_offset = TensorCoord()
):
lhs(lhs), rhs(rhs), source_offset(source_offset), source_size(source_size), dest_offset(dest_offset) { }
};
//
// Data members
//
Params params;
//
// Methods
//
/// Constructor
CUTLASS_HOST_DEVICE
TensorFillFunc(
Params const &params): params(params) { }
/// Copies an element from the source view to the destination view
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
TensorCoord dst_coord = params.dest_offset + coord;
TensorCoord src_coord = params.source_offset + coord;
if (dst_coord < params.lhs.size() && src_coord < params.rhs.size()) {
params.lhs.at(dst_coord) = DestType(params.rhs.at(src_coord));
}
}
};
} // namespace detail
/// Fills a TensorView with the elements from another TensorView
template <typename ViewL, typename ViewR>
void TensorFill(
ViewL lhs,
ViewR rhs,
typename ViewL::TensorCoord const &source_offset,
typename ViewL::TensorCoord const &source_size,
typename ViewL::TensorCoord const &dest_offset) {
typedef typename ViewL::TensorCoord TensorCoord;
TensorCoord dst_size = lhs.size() - dest_offset;
TensorCoord src_size = rhs.size() - source_offset;
TensorCoord fill_size = dst_size.clamp(src_size);
// Fill function
typedef detail::TensorFillFunc<ViewL, ViewR> Func;
typedef typename Func::Params Params;
Params params(lhs, rhs, source_offset, source_size, dest_offset);
TensorForEach<Func, ViewL::kRank, Params>(fill_size, params);
}
/// Fills a TensorView with the elements from another TensorView
template <typename ViewL, typename ViewR>
void TensorFill(
ViewL lhs,
ViewR rhs,
typename ViewL::TensorCoord const &source_offset = typename ViewL::TensorCoord()) {
typedef typename ViewL::TensorCoord TensorCoord;
TensorFill(lhs, rhs, source_offset, rhs.size(), TensorCoord());
}
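// Usage sketch (editorial, not part of this commit): copy a 16x16 subtensor
// from src into dst at offset (8, 8), assuming rank-2 views whose TensorCoord
// is Coord<2>. copy_tile_example is a hypothetical helper.
template <typename DstView, typename SrcView>
void copy_tile_example(DstView dst, SrcView src) {
  TensorFill(
    dst,
    src,
    make_Coord(0, 0),     // source offset
    make_Coord(16, 16),   // extent of the copied subtensor
    make_Coord(8, 8));    // destination offset
}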
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
/// Helper to fill a tensor with a single element value
template <typename ViewL>
struct TensorFillElementFunc {
/// Coordinate in tensor's index space
typedef typename ViewL::TensorCoord TensorCoord;
/// Destination element type
typedef typename ViewL::Storage DestType;
/// Parameters object
struct Params {
//
// Data members
//
/// View of left-hand-side tensor
ViewL lhs;
/// Source offset coordinate
TensorCoord offset;
/// Element to overwrite with
DestType value;
//
// Methods
//
/// Constructs a parameters object for filling a tensor
CUTLASS_HOST_DEVICE
Params(
ViewL const &lhs,
DestType const &value,
TensorCoord const &offset = TensorCoord()
):
lhs(lhs), value(value), offset(offset) { }
};
//
// Data members
//
Params params;
//
// Methods
//
/// Constructor
CUTLASS_HOST_DEVICE
TensorFillElementFunc(
Params const &params): params(params) { }
/// Writes the fill value at the destination coordinate
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
TensorCoord dst_coord = params.offset + coord;
if (dst_coord < params.lhs.size()) {
params.lhs.at(dst_coord) = params.value;
}
}
};
} // namespace detail
/// Method to perform the actual fill
template <typename ViewL>
void TensorFillElement(
ViewL const &lhs,
typename ViewL::Storage const &value,
typename ViewL::TensorCoord const &offset,
typename ViewL::TensorCoord const &size) {
// Fill function
typedef detail::TensorFillElementFunc<ViewL> Func;
typedef typename Func::Params Params;
Params params(lhs, value, offset);
TensorForEach<Func, ViewL::kRank, Params>(size, params);
}
/// Fills a tensor with a single element value, starting at an optional offset
template <typename ViewL>
void TensorFillElement(
ViewL lhs,
typename ViewL::Storage value,
typename ViewL::TensorCoord const &offset = typename ViewL::TensorCoord()) {
TensorFillElement(lhs, value, offset, lhs.size() - offset);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace reference
} // namespace cutlass

View File

@@ -0,0 +1,72 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <stdexcept>
#include "cutlass/cutlass.h"
#include "tools/util/reference/device/kernel/tensor_foreach.h"
namespace cutlass {
namespace reference {
namespace device {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Launches a kernel for each element in a tensor's index space.
template <typename Func, int Rank, typename Params>
struct TensorForEach {
/// Constructor performs the operation.
TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {
if (!grid_size || !block_size) {
// if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
&grid_size,
&block_size,
reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
if (result != cudaSuccess) {
throw std::runtime_error("Failed to query occupancy.");
}
// Limit block size. This has the effect of increasing the number of items processed by a
// single thread and reduces the impact of initialization overhead.
block_size = (block_size < 128 ? block_size : 128);
}
dim3 grid(grid_size, 1, 1);
dim3 block(block_size, 1, 1);
kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
}
};
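// Usage sketch (editorial, not part of this commit): a minimal functor
// satisfying the contract the launcher above expects - constructible from a
// Params object on the device and callable with a tensor coordinate.
// NegateFunc is a hypothetical name.
template <typename View>
struct NegateFunc {
  struct Params {
    View view;
    CUTLASS_HOST_DEVICE Params() { }
    CUTLASS_HOST_DEVICE Params(View const &view): view(view) { }
  };
  Params params;
  CUTLASS_DEVICE
  NegateFunc(Params const &params): params(params) { }
  /// Negates one element in place
  CUTLASS_DEVICE
  void operator()(typename View::TensorCoord const &coord) {
    params.view.at(coord) = -params.view.at(coord);
  }
};
// Launched over a view's entire index space:
//   typedef NegateFunc<View> Func;
//   TensorForEach<Func, View::kRank, typename Func::Params>(
//     view.size(), typename Func::Params(view));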
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace reference
} // namespace cutlass

View File

@@ -0,0 +1,270 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Reference implementation for GEMM in host-side code.
*/
#pragma once
#include "cutlass/coord.h"
#include "cutlass/matrix_traits.h"
#include "cutlass/tensor_view.h"
#include "cutlass/gemm/gemm_coord.h"
namespace cutlass {
namespace reference {
namespace host {
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
/// Template function to compute an inner product.
template <typename Atype, typename Btype, typename Ctype>
Ctype inner_product(Atype a, Btype b, Ctype c) {
return Ctype(a) * Ctype(b) + c;
}
/// Specialization for matrix multiplication with binary operands
template <>
inline int inner_product<Vector<bin1_t, 32>, Vector<bin1_t, 32>, int>(
Vector<bin1_t, 32> a,
Vector<bin1_t, 32> b,
int c) {
int accum = 0;
for (int bit = 0; bit < 32; bit++) {
accum += a[bit] ^ b[bit];
}
return accum + c;
}
/// Specialization for matrix multiplication with signed 4-bit integer operands
template <> inline
int inner_product<Vector<int4_t, 8>, Vector<int4_t, 8>, int>(
Vector<int4_t, 8> a,
Vector<int4_t, 8> b,
int c) {
int accum = 0;
for (int k = 0; k < 8; k++) {
accum += a[k] * b[k];
}
return accum + c;
}
/// Specialization for matrix multiplication with unsigned 4-bit integer operands
template <> inline
int inner_product<Vector<uint4_t, 8>, Vector<uint4_t, 8>, int>(
Vector<uint4_t, 8> a,
Vector<uint4_t, 8> b,
int c) {
int accum = 0;
for (int k = 0; k < 8; k++) {
accum += a[k] * b[k];
}
return accum + c;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename SrcType, typename DstType>
struct Cast {
// Default behavior: convert to the destination type
static inline DstType apply(SrcType src) { return static_cast<DstType>(src); };
};
template <>
struct Cast<float, int8_t> {
static inline int8_t apply(float src) {
// Clamp to the range of signed 8-bit integers.
return static_cast<int8_t>(fmaxf(-128.f, fminf(127.f, src)));
};
};
template <>
struct Cast<float, uint8_t> {
static inline uint8_t apply(float src) {
// Clamp to the range of unsigned 8-bit integers.
return static_cast<uint8_t>(fmaxf(0.f, fminf(255.f, src)));
};
};
} // namespace detail
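// Editorial illustration, not part of this commit: the specializations above
// saturate rather than relying on implementation-defined float-to-integer
// conversion. example_saturating_casts is a hypothetical helper.
inline void example_saturating_casts() {
  int8_t a = detail::Cast<float, int8_t>::apply(300.f);    // yields 127
  int8_t b = detail::Cast<float, int8_t>::apply(-300.f);   // yields -128
  uint8_t c = detail::Cast<float, uint8_t>::apply(-5.f);   // yields 0
  (void)a; (void)b; (void)c;
}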
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// Explicitly naming types needed by this template can be cumbersome, particularly for the
/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
/// AccumulatorType(0) as the last function argument can be easier than naming all template
/// arguments explicitly.
template <
typename TensorRefA,
typename TensorRefB,
typename TensorRefC,
typename ScalarType,
typename AccumulatorType
>
void Gemm(
gemm::GemmCoord problem_size,
ScalarType alpha,
TensorRefA tensor_a,
TensorRefB tensor_b,
ScalarType beta,
TensorRefC tensor_c,
AccumulatorType initial_accum) {
typedef typename TensorRefA::Storage AType;
typedef typename TensorRefB::Storage BType;
typedef typename TensorRefC::Storage CType;
static_assert(
TensorRefA::kRank == 2 &&
TensorRefB::kRank == 2 &&
TensorRefC::kRank == 2, "Tensors must be of rank 2");
// Note: batch is ignored.
int const M = problem_size.m();
int const N = problem_size.n();
int const K = problem_size.k();
// Blocking is necessary to speed up the reference implementation
int const Mblock = 32;
int const Nblock = 32;
for (int row_block = 0; row_block < M; row_block += Mblock) {
for (int col_block = 0; col_block < N; col_block += Nblock) {
AccumulatorType accum[Mblock][Nblock];
for (int j = 0; j < Nblock; j++) {
for (int i = 0; i < Mblock; i++) {
accum[i][j] = initial_accum;
}
}
for (int k_block = 0; k_block < K; ++k_block) {
for (int j = 0; j < Nblock; j++) {
for (int i = 0; i < Mblock; i++) {
int row = row_block + i;
int col = col_block + j;
if (row < M && col < N) {
AType a = tensor_a.at(MatrixCoord(row, k_block));
BType b = tensor_b.at(MatrixCoord(k_block, col));
accum[i][j] = detail::inner_product(a, b, accum[i][j]);
}
}
}
}
for (int j = 0; j < Nblock; j++) {
for (int i = 0; i < Mblock; i++) {
int row = row_block + i;
int col = col_block + j;
MatrixCoord coord = MatrixCoord(row, col);
if (row < M && col < N) {
tensor_c.at(coord) = detail::Cast<ScalarType, CType>::apply(
alpha * ScalarType(accum[i][j]) +
beta * ScalarType(tensor_c.at(coord)));
}
}
}
}
}
}
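// Usage sketch (editorial, not part of this commit): invoking the reference
// GEMM on three rank-2 views. The GemmCoord constructor argument order
// (k, n, m) is an assumption; example_gemm and the view types are hypothetical.
template <typename ViewA, typename ViewB, typename ViewC>
void example_gemm(ViewA a, ViewB b, ViewC c, int M, int N, int K) {
  gemm::GemmCoord problem(K, N, M);
  float alpha = 1.f;
  float beta = 0.f;
  // Accumulate in float; pass the initial accumulator explicitly.
  Gemm(problem, alpha, a, b, beta, c, float(0));
}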
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// This assumes the accumulator type is the same type as the scalars.
template <
typename TensorRefA,
typename TensorRefB,
typename TensorRefC,
typename ScalarType
>
void Gemm(
gemm::GemmCoord problem_size,
ScalarType alpha,
TensorRefA tensor_a,
TensorRefB tensor_b,
ScalarType beta,
TensorRefC tensor_c) {
Gemm(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Computes a batch of GEMMs over a set of matrices of common dimension.
template <
typename TensorRefCollectionA,
typename TensorRefCollectionB,
typename TensorRefCollectionC,
typename ScalarType,
typename AccumulatorType
>
void BatchGemm(
gemm::GemmCoord problem_size,
ScalarType alpha,
TensorRefCollectionA const& tensor_a,
TensorRefCollectionB const& tensor_b,
ScalarType beta,
TensorRefCollectionC &tensor_c,
AccumulatorType initial_accum = AccumulatorType(0)) {
typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin();
typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin();
typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin();
for (int batch = 0;
batch < problem_size.batch();
++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) {
Gemm(
problem_size,
alpha,
*tensor_a_it,
*tensor_b_it,
beta,
*tensor_c_it,
initial_accum);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace host
} // namespace reference
} // namespace cutlass

View File

@@ -0,0 +1,478 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Defines host-side elementwise operations on TensorView.
*/
#pragma once
// Standard Library includes
#include <fstream>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <cstdlib>
#include <cmath>
// Cutlass includes
#include "cutlass/cutlass.h"
#include "tools/util/distribution.h"
#include "tools/util/type_traits.h"
#include "tools/util/reference/host/tensor_foreach.h"
namespace cutlass {
namespace reference {
namespace host {
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
/// Computes a random uniform distribution
template <typename View_>
struct RandomUniformFunc {
/// View type
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
/// Parameters structure
struct Params {
/// View object
View view;
/// RNG seed
unsigned seed;
/// Distribution
Distribution dist;
/// Default ctor
Params() { }
/// Constructor
Params(
View const &view,
unsigned seed,
Distribution dist
): view(view), seed(seed), dist(dist) { }
};
//
// Data members
//
/// Parameters object
Params params;
//
// Methods
//
/// Host-side initialization of RNG
RandomUniformFunc(Params const &params): params(params) {
std::srand(params.seed);
}
/// Compute random value and update RNG state
void operator()(TensorCoord const &coord) {
double range = params.dist.uniform.max - params.dist.uniform.min;
double rnd = double(std::rand()) / double(RAND_MAX);
rnd = params.dist.uniform.min + range * rnd;
// Random values are cast to integer after scaling by a power of two to facilitate error
// testing
T result;
if (params.dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
result = T(rnd / double(1 << params.dist.int_scale));
}
else {
result = T(rnd);
}
params.view.at(coord) = result;
}
};
/// Computes a random Gaussian distribution
template <typename View_>
struct RandomGaussianFunc {
/// View type
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
/// Parameters structure
struct Params {
/// View object
View view;
/// RNG seed
unsigned seed;
/// RNG distribution
Distribution dist;
/// Default ctor
Params() { }
/// Constructor
Params(
View const &view,
unsigned seed,
Distribution dist
): view(view), seed(seed), dist(dist) { }
};
//
// Data members
//
/// Parameters object
Params params;
/// Constant PI
double pi;
//
// Methods
//
/// Host-side initialization of RNG
RandomGaussianFunc(Params const &params): params(params) {
pi = std::acos(-1);
}
/// Compute random value and update RNG state
void operator()(TensorCoord const &coord) {
// Box-Muller transform to generate random numbers with Normal distribution
double u1 = double(std::rand()) / double(RAND_MAX);
double u2 = double(std::rand()) / double(RAND_MAX);
double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2);
// Scale according to Gaussian distribution parameters
rnd = params.dist.gaussian.mean + params.dist.gaussian.stddev * rnd;
T result;
if (params.dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
result = T(rnd / double(1 << params.dist.int_scale));
}
else {
result = T(rnd);
}
params.view.at(coord) = result;
}
};
/// Computes a linear combination of each element
template <typename View_>
struct LinearCombinationFunc {
/// View type
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
//
// Data members
//
/// TensorView object
View view;
/// Delta
Coord<View::kRank, double> delta;
/// Offset
double offset;
//
// Methods
//
/// Constructor
LinearCombinationFunc(
View const &view,
Distribution dist
): view(view) {
offset = dist.linear.offset;
if (View::kRank >= 1) {
delta[View::kRank - 1] = dist.linear.delta_column;
}
if (View::kRank >= 2) {
delta[View::kRank - 2] = dist.linear.delta_row;
}
// Additional ranks have delta of zero
for (int i = View::kRank - 2; i > 0; --i) {
delta[i - 1] = 0;
}
}
/// Compute linear combination
void operator()(TensorCoord const &coord) {
double result = offset;
for (int i = 0; i < View::kRank; ++i) {
result += delta[i] * double(coord[i]);
}
view.at(coord) = T(result);
}
};
/// Functor writing 1 if the coordinate lies on the tensor's diagonal, 0 otherwise
template <typename View_>
struct IdentityFunc {
/// TensorView
typedef View_ View;
/// Scalar type
typedef typename View::Storage T;
/// Coordinate in tensor's index space
typedef typename View::TensorCoord TensorCoord;
//
// Data members
//
/// View object
View view;
/// Default ctor
IdentityFunc(View const &view): view(view) { }
/// Computes an identity
void operator()(TensorCoord const &coord) {
bool equal = true;
for (int i = 0; i < View::kRank; ++i) {
if (coord[i] != coord[0]) {
equal = false;
}
}
view.at(coord) = equal ? T(1) : T(0);
}
};
} // namespace detail
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Initializes a tensor randomly or procedurally.
template <typename View>
void TensorInitialize(View const &view,
unsigned seed,
Distribution const &dist) {
typedef typename View::Storage Scalar;
switch (dist.kind) {
case Distribution::Uniform:
{
typedef detail::RandomUniformFunc<View> Func;
typedef typename Func::Params Params;
TensorForEach<Func, View::kRank, Params>(
view.size(),
Params(view, seed, dist)
);
}
break;
case Distribution::Gaussian:
{
typedef detail::RandomGaussianFunc<View> Func;
typedef typename Func::Params Params;
TensorForEach<Func, View::kRank, Params>(
view.size(),
Params(view, seed, dist)
);
}
break;
case Distribution::Linear:
{
typedef detail::LinearCombinationFunc<View> Func;
TensorForEach<Func, View::kRank, Func>(
view.size(),
Func(view, dist));
}
break;
case Distribution::Identity:
{
typedef detail::IdentityFunc<View> Func;
Func func(view);
TensorForEach<Func, View::kRank, Func>(view.size(), func);
}
break;
default:
break;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
/// Compares two tensor views of equal rank and dimension.
template <typename ViewL, typename ViewR>
struct TensorEqualsFunc {
/// Storage type
typedef typename ViewL::Storage T;
/// Unsigned integer type of same size as View type
typedef typename cutlass::TypeTraits<T>::unsigned_type UnsignedType;
/// Coordinate in tensor's index space
typedef typename ViewL::TensorCoord TensorCoord;
/// Assertions
static_assert(ViewL::kRank == ViewR::kRank,
"Cannot compare tensors of different rank");
//
// Data members
//
/// View of left-hand-side tensor
ViewL lhs;
/// View of right-hand-side tensor
ViewR rhs;
/// Pointer to result scalar - only written with 0 if values are incorrect
int *result;
//
// Methods
//
/// Constructor
TensorEqualsFunc(ViewL const &lhs, ViewR const &rhs, int *result): lhs(lhs), rhs(rhs), result(result) { }
/// Equality check
void operator()(TensorCoord const &coord) {
UnsignedType _lhs = reinterpret_cast<UnsignedType const &>(lhs.at(coord));
UnsignedType _rhs = reinterpret_cast<UnsignedType const &>(rhs.at(coord));
if (_lhs != _rhs) {
*result = 0;
}
}
};
} // namespace detail
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Returns true if two tensor views are equal.
template <typename ViewL, typename ViewR>
bool TensorEquals(ViewL const &lhs, ViewR const &rhs) {
// Sizes must be identical
if (lhs.size() != rhs.size()) {
return false;
}
int result = 1;
typedef detail::TensorEqualsFunc<ViewL, ViewR> Func;
Func func(lhs, rhs, &result);
TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);
return result != 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to apply a binary operator in place
template <typename ViewL, typename ViewR, typename BinaryFunc>
struct TensorFuncBinaryOp {
/// Coordinate in tensor's index space
typedef typename ViewL::TensorCoord TensorCoord;
//
// Data members
//
/// View of left-hand-side tensor
ViewL lhs;
/// View of right-hand-side tensor
ViewR rhs;
/// Binary function applied to each element
BinaryFunc func;
//
// Methods
//
/// Constructor
TensorFuncBinaryOp(
ViewL const &lhs,
ViewR const &rhs,
BinaryFunc func = BinaryFunc()): lhs(lhs), rhs(rhs), func(func) { }
/// Applies the binary function and stores the result in the left-hand tensor
void operator()(TensorCoord const &coord) {
lhs.at(coord) = func(lhs.at(coord), rhs.at(coord));
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace host
} // namespace reference
} // namespace cutlass

View File

@ -0,0 +1,102 @@
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <stdexcept>
#include "cutlass/cutlass.h"
#include "tools/util/reference/device/kernel/tensor_foreach.h"
namespace cutlass {
namespace reference {
namespace host {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines several helpers
namespace detail {
/// Helper to perform for-each operation
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {
/// Index of the active rank
static int const kActiveRank = Rank - RankRemaining - 1;
/// Constructor for general rank
TensorForEachHelper(
Func &func,
Coord<Rank> const &size,
Coord<Rank> &coord) {
for (int i = 0; i < size.at(kActiveRank); ++i) {
coord[kActiveRank] = i;
TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, size, coord);
}
}
};
/// Helper to perform for-each operation
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {
/// Index of the active rank
static int const kActiveRank = Rank - 1;
/// Constructor for fastest-changing rank
TensorForEachHelper(
Func &func,
Coord<Rank> const &size,
Coord<Rank> &coord) {
for (int i = 0; i < size.at(kActiveRank); ++i) {
coord[kActiveRank] = i;
func(coord);
}
}
};
} // namespace detail
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Iterates over the index space of a tensor
template <typename Func, int Rank, typename Params>
struct TensorForEach {
/// Constructor performs the operation.
TensorForEach(Coord<Rank> size, Params params = Params()) {
Func func(params);
Coord<Rank> coord;
detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord);
}
};
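// Editorial illustration, not part of this commit: for Rank = 2 the recursion
// above is equivalent to these nested loops, with the fastest-changing rank
// innermost. tensor_for_each_rank2_unrolled is a hypothetical name.
template <typename Func>
void tensor_for_each_rank2_unrolled(Func &func, Coord<2> const &size) {
  Coord<2> coord;
  for (coord[0] = 0; coord[0] < size[0]; ++coord[0]) {    // kActiveRank = 0
    for (coord[1] = 0; coord[1] < size[1]; ++coord[1]) {  // kActiveRank = 1
      func(coord);
    }
  }
}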
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace host
} // namespace reference
} // namespace cutlass