Checkpointing CUTLASS 1.1 release.

tools/util/reference/device/kernel/tensor_elementwise.h (new file, 162 lines)
@@ -0,0 +1,162 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <curand_kernel.h>

#include "cutlass/cutlass.h"

namespace cutlass {
namespace reference {
namespace device {
namespace kernel {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to uniform random distribution
template <typename T>
__global__ void TensorInitializeUniform(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      double range = dist.uniform.max - dist.uniform.min;

      double rnd = curand_uniform(&rng_state[threadIdx.x]);

      rnd = dist.uniform.min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      tensor += ldm;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to Gaussian random distribution
template <typename T>
__global__ void TensorInitializeGaussian(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing

      double rnd = curand_normal(&rng_state[threadIdx.x]);

      rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;

      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      // Advance to the next strided element (mirrors TensorInitializeUniform)
      tensor += ldm;
    }
  }
}

/// Kernel to initialize tensor to a linear combination of its coordinates
template <typename T>
__global__ void TensorInitializeLinear(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      *tensor =
          dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;

      tensor += ldm;
    }
  }
}

/// Kernel to initialize tensor to an identity matrix
template <typename T>
__global__ void TensorInitializeIdentity(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      *tensor = (c_idx == s_idx ? T(1) : T(0));

      tensor += ldm;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace device
} // namespace reference
} // namespace cutlass
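
For illustration, a minimal launch sketch for the kernels above, assuming a column-major M x N matrix whose contiguous dimension is M and strided dimension is N. The wrapper function and the 256-thread block size are assumptions, not part of this commit; blockDim.x must not exceed 1024, the extent of the shared rng_state array, and each blockIdx.y step covers blockDim.x strided indices, matching the kernel's s_offset loop.

#include <cuda_runtime.h>
#include "tools/util/reference/device/kernel/tensor_elementwise.h"

// Hypothetical wrapper (illustrative only): fills a column-major M x N matrix
// with leading dimension M.
void initialize_uniform(float *d_tensor, int M, int N, int64_t seed,
                        cutlass::Distribution const &dist) {
  int const block = 256;               // must be <= 1024 (size of rng_state)
  dim3 grid((M + block - 1) / block,   // covers the contiguous dimension
            (N + block - 1) / block);  // each y-block covers `block` strided indices
  cutlass::reference::device::kernel::TensorInitializeUniform<float>
      <<<grid, dim3(block, 1, 1)>>>(dist, seed, M, N, d_tensor, M);
}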

tools/util/reference/device/kernel/tensor_foreach.h (new file, 112 lines)
@@ -0,0 +1,112 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"

namespace cutlass {
namespace reference {
namespace device {
namespace kernel {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines several helpers
namespace detail {

/// Helper to perform for-each operation
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {

  /// Constructor for general rank
  __inline__ __device__
  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {

    int64_t product = 1;

    CUTLASS_PRAGMA_UNROLL
    for (int i = Rank - RankRemaining; i < Rank; ++i) {
      product *= size[i];
    }

    coord[Rank - 1 - RankRemaining] = index / product;
    int64_t remaining = index % product;

    TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, size, coord, remaining);
  }
};

/// Helper to perform for-each operation
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {

  /// Constructor for fastest changing rank
  __inline__ __device__
  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {

    coord[Rank - 1] = index;

    if (coord < size) {
      func(coord);
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Helper to perform for-each operation
template <typename Func, int Rank, typename Params>
__global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {

  Func func(params);

  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
  int64_t max_index = 1;

  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < Rank; ++i) {
    max_index *= size[i];
  }

  CUTLASS_PRAGMA_NO_UNROLL
  while (index < max_index) {
    Coord<Rank> coord;

    detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index);
    index += blockDim.x * gridDim.x;
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace device
} // namespace reference
} // namespace cutlass
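
The recursive helper above delinearizes a flat index into a rank-N coordinate, slowest-varying dimension first: each step divides by the product of the extents to its right, then recurses on the remainder. An equivalent host-side loop, shown only to make the index arithmetic explicit:

#include <cstdint>

// Equivalent of TensorForEachHelper's recursion: peel off one coordinate per
// step; the last coordinate is the fastest-changing rank.
template <int Rank>
void delinearize(int64_t index, int const (&size)[Rank], int (&coord)[Rank]) {
  for (int r = 0; r < Rank - 1; ++r) {
    int64_t product = 1;
    for (int i = r + 1; i < Rank; ++i) {  // product of extents to the right of r
      product *= size[i];
    }
    coord[r] = int(index / product);
    index %= product;
  }
  coord[Rank - 1] = int(index);
}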

tools/util/reference/device/tensor_elementwise.h (new file, 772 lines)
@@ -0,0 +1,772 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
    in this header are not specialized for any particular data layout and are therefore not
    intended to offer the best possible performance. Rather, they are intended to be generic
    reference implementations to support the CUTLASS unit tests.
*/

#pragma once

// Standard Library includes
#include <fstream>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>

// CUDA includes
#include <cublas_v2.h>
#include <curand_kernel.h>

// Cutlass includes
#include "cutlass/cutlass.h"
#include "tools/util/device_memory.h"
#include "tools/util/distribution.h"
#include "tools/util/type_traits.h"
#include "tools/util/host_tensor.h"
#include "tools/util/reference/device/tensor_foreach.h"

namespace cutlass {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace reference {
namespace device {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Computes a random uniform distribution
template <typename View_>
struct RandomUniformFunc {

  /// View type
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    /// View object
    View view;

    /// RNG seed
    int64_t seed;

    /// Distribution
    Distribution dist;

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    /// Constructor
    CUTLASS_HOST_DEVICE
    Params(
      View const &view,
      int64_t seed,
      Distribution dist
    ): view(view), seed(seed), dist(dist) { }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  /// RNG state object
  curandState_t rng_state;

  //
  // Methods
  //

  /// Device-side initialization of RNG
  CUTLASS_DEVICE
  RandomUniformFunc(Params const &params): params(params) {

    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;

    curand_init(params.seed, gtid, 0, &rng_state);
  }

  /// Compute random value and update RNG state
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    double range = params.dist.uniform.max - params.dist.uniform.min;
    double rnd = curand_uniform(&rng_state);
    rnd = params.dist.uniform.min + range * rnd;

    // Random values are cast to integer after scaling by a power of two to facilitate error
    // testing
    T result;
    if (params.dist.int_scale >= 0) {
      rnd = double(int(rnd * double(1 << params.dist.int_scale)));
      result = T(rnd / double(1 << params.dist.int_scale));
    }
    else {
      result = T(rnd);
    }

    params.view.at(coord) = result;
  }
};

/// Computes a random Gaussian distribution
template <typename View_>
struct RandomGaussianFunc {

  /// View type
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    /// View object
    View view;

    /// RNG seed
    int64_t seed;

    /// RNG distribution
    Distribution dist;

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    /// Constructor
    CUTLASS_HOST_DEVICE
    Params(
      View const &view,
      int64_t seed,
      Distribution dist
    ): view(view), seed(seed), dist(dist) { }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  /// RNG state object
  curandState_t rng_state;

  //
  // Methods
  //

  /// Device-side initialization of RNG
  CUTLASS_DEVICE
  RandomGaussianFunc(Params const &params): params(params) {

    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;

    curand_init(params.seed, gtid, 0, &rng_state);
  }

  /// Compute random value and update RNG state
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    double rnd = curand_normal(&rng_state);
    rnd = params.dist.gaussian.mean + params.dist.gaussian.stddev * rnd;

    T result;
    if (params.dist.int_scale >= 0) {
      rnd = double(int(rnd * double(1 << params.dist.int_scale)));
      result = T(rnd / double(1 << params.dist.int_scale));
    }
    else {
      result = T(rnd);
    }

    params.view.at(coord) = result;
  }
};

/// Computes a linear combination of each element
template <typename View_>
struct LinearCombinationFunc {

  /// View type
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// TensorView object
  View view;

  /// Delta
  Coord<View::kRank, double> delta;

  /// Offset
  double offset;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  LinearCombinationFunc(
    View const &view,
    Distribution dist
  ): view(view) {

    offset = dist.linear.offset;
    if (View::kRank >= 1) {
      delta[View::kRank - 1] = dist.linear.delta_column;
    }
    if (View::kRank >= 2) {
      delta[View::kRank - 2] = dist.linear.delta_row;
    }
    // Additional ranks have delta of zero
    for (int i = View::kRank - 2; i > 0; --i) {
      delta[i - 1] = 0;
    }
  }

  /// Compute linear combination
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    double result = offset;
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < View::kRank; ++i) {
      result += delta[i] * double(coord[i]);
    }
    view.at(coord) = T(result);
  }
};

/// Returns 1 or 0 if the coordinate is along the tensor's diagonal
template <typename View_>
struct IdentityFunc {

  /// TensorView
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View object
  View view;

  /// Default ctor
  CUTLASS_HOST_DEVICE
  IdentityFunc(View const &view): view(view) { }

  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    bool equal = true;
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < View::kRank; ++i) {
      if (coord[i] != coord[0]) {
        equal = false;
      }
    }
    view.at(coord) = equal ? T(1) : T(0);
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Initializes a tensor randomly or procedurally.
template <typename View>
void TensorInitialize(View const &view,
                      int64_t seed,
                      Distribution const &dist) {

  typedef typename View::Storage Scalar;

  switch (dist.kind) {
    case Distribution::Uniform:
    {
      typedef detail::RandomUniformFunc<View> Func;
      typedef typename Func::Params Params;

      TensorForEach<Func, View::kRank, Params>(
        view.size(),
        Params(view, seed, dist)
      );
    }
    break;
    case Distribution::Gaussian:
    {
      typedef detail::RandomGaussianFunc<View> Func;
      typedef typename Func::Params Params;

      TensorForEach<Func, View::kRank, Params>(
        view.size(),
        Params(view, seed, dist)
      );
    }
    break;
    case Distribution::Linear:
    {
      typedef detail::LinearCombinationFunc<View> Func;
      TensorForEach<Func, View::kRank, Func>(
        view.size(),
        Func(view, dist));
    }
    break;
    case Distribution::Identity:
    {
      typedef detail::IdentityFunc<View> Func;

      Func func(view);

      TensorForEach<Func, View::kRank, Func>(view.size(), func);
    }
    break;
    default:
      break;
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace reference

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Dispatcher to appropriate initialization kernel - preserved for backwards compatibility
template <typename T>
inline void tensor_initialize(Distribution const &dist,
                              int64_t seed,
                              int dim_contiguous,
                              int dim_strided,
                              T *tensor,
                              int ldm) {

  TensorView<T, 2> view(tensor, make_Coord(ldm, 1), make_Coord(dim_strided, dim_contiguous));
  reference::device::TensorInitialize(view, seed, dist);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace reference {
namespace device {
namespace detail {

/// Compares two tensor views of equal rank and dimension.
template <typename ViewL, typename ViewR>
struct TensorEqualsFunc {

  /// Storage type
  typedef typename ViewL::Storage T;

  /// Unsigned integer type of same size as View type
  typedef typename cutlass::TypeTraits<T>::unsigned_type UnsignedType;

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Assertions
  static_assert(ViewL::kRank == ViewR::kRank,
    "Cannot compare tensors of different rank");

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Pointer to result scalar - only written with 0 if values are incorrect
  int *result;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorEqualsFunc(ViewL const &lhs, ViewR const &rhs, int *result): lhs(lhs), rhs(rhs), result(result) { }

  /// Equality check
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    UnsignedType _lhs = reinterpret_cast<UnsignedType const &>(lhs.at(coord));
    UnsignedType _rhs = reinterpret_cast<UnsignedType const &>(rhs.at(coord));
    if (_lhs != _rhs) {
      *result = 0;
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Returns true if two tensor views are equal.
template <typename ViewL, typename ViewR>
bool TensorEquals(ViewL const &lhs, ViewR const &rhs) {

  // Sizes must be identical
  if (lhs.size() != rhs.size()) {
    return false;
  }

  // Allocate device memory to contain result of kernel reduction
  HostTensor<int, 1> result(1);
  result.fill(1);
  result.sync_device();

  typedef detail::TensorEqualsFunc<ViewL, ViewR> Func;
  Func func(lhs, rhs, result.device_data());

  TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);
  result.sync_host();

  return result.at(0) != 0;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Helper to apply a binary operator in place
template <typename ViewL, typename ViewR, typename BinaryFunc>
struct TensorFuncBinaryOp {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Binary function applied to each element
  BinaryFunc func;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorFuncBinaryOp(
    ViewL const &lhs,
    ViewR const &rhs,
    BinaryFunc func = BinaryFunc()): lhs(lhs), rhs(rhs), func(func) { }

  /// Applies the binary function in place
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    lhs.at(coord) = func(lhs.at(coord), rhs.at(coord));
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Helper to copy elements from one tensor view into another
template <typename ViewL, typename ViewR>
struct TensorFillFunc {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Destination element type
  typedef typename ViewL::Storage DestType;

  /// Source element type
  typedef typename ViewR::Storage SrcType;

  /// Parameters object
  struct Params {

    //
    // Data members
    //

    /// View of left-hand-side tensor
    ViewL lhs;

    /// View of right-hand-side tensor
    ViewR rhs;

    /// Source offset coordinate
    TensorCoord source_offset;

    /// Size of the subtensor copied from the source
    TensorCoord source_size;

    /// Offset in destination
    TensorCoord dest_offset;

    //
    // Methods
    //

    /// Constructs a parameters object for filling a tensor
    Params(
      ViewL const &lhs,
      ViewR const &rhs,
      TensorCoord const &source_offset = TensorCoord()
    ):
      lhs(lhs), rhs(rhs), source_offset(source_offset), source_size(rhs.size() - source_offset) { }

    /// Constructs a parameters object for filling a tensor
    Params(
      ViewL const &lhs,
      ViewR const &rhs,
      TensorCoord const &source_offset,
      TensorCoord const &source_size,
      TensorCoord const &dest_offset = TensorCoord()
    ):
      lhs(lhs), rhs(rhs), source_offset(source_offset), source_size(source_size), dest_offset(dest_offset) { }
  };

  //
  // Data members
  //

  Params params;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorFillFunc(
    Params const &params): params(params) { }

  /// Copies a single element
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {

    TensorCoord dst_coord = params.dest_offset + coord;
    TensorCoord src_coord = params.source_offset + coord;

    if (dst_coord < params.lhs.size() && src_coord < params.rhs.size()) {
      params.lhs.at(dst_coord) = DestType(params.rhs.at(src_coord));
    }
  }
};

} // namespace detail

/// Fills a TensorView with the elements from another TensorView
template <typename ViewL, typename ViewR>
void TensorFill(
  ViewL lhs,
  ViewR rhs,
  typename ViewL::TensorCoord const &source_offset,
  typename ViewL::TensorCoord const &source_size,
  typename ViewL::TensorCoord const &dest_offset) {

  typedef typename ViewL::TensorCoord TensorCoord;

  TensorCoord dst_size = lhs.size() - dest_offset;
  TensorCoord src_size = rhs.size() - source_offset;

  TensorCoord fill_size = dst_size.clamp(src_size);

  // Fill function
  typedef detail::TensorFillFunc<ViewL, ViewR> Func;
  typedef typename Func::Params Params;

  Params params(lhs, rhs, source_offset, source_size, dest_offset);

  TensorForEach<Func, ViewL::kRank, Params>(fill_size, params);
}

/// Fills a TensorView with the elements from another TensorView
template <typename ViewL, typename ViewR>
void TensorFill(
  ViewL lhs,
  ViewR rhs,
  typename ViewL::TensorCoord const &source_offset = typename ViewL::TensorCoord()) {

  typedef typename ViewL::TensorCoord TensorCoord;

  TensorFill(lhs, rhs, source_offset, rhs.size(), TensorCoord());
}

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Helper to overwrite a region of a tensor with a single value
template <typename ViewL>
struct TensorFillElementFunc {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Destination element type
  typedef typename ViewL::Storage DestType;

  /// Parameters object
  struct Params {

    //
    // Data members
    //

    /// View of left-hand-side tensor
    ViewL lhs;

    /// Source offset coordinate
    TensorCoord offset;

    /// Element to overwrite with
    DestType value;

    //
    // Methods
    //

    /// Constructs a parameters object for filling a tensor
    CUTLASS_HOST_DEVICE
    Params(
      ViewL const &lhs,
      DestType const &value,
      TensorCoord const &offset = TensorCoord()
    ):
      lhs(lhs), value(value), offset(offset) { }
  };

  //
  // Data members
  //

  Params params;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorFillElementFunc(
    Params const &params): params(params) { }

  /// Writes the fill value
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {

    TensorCoord dst_coord = params.offset + coord;

    if (dst_coord < params.lhs.size()) {
      params.lhs.at(dst_coord) = params.value;
    }
  }
};

} // namespace detail

/// Method to perform the actual fill
template <typename ViewL>
void TensorFillElement(
  ViewL const &lhs,
  typename ViewL::Storage const &value,
  typename ViewL::TensorCoord const &offset,
  typename ViewL::TensorCoord const &size) {

  // Fill function
  typedef detail::TensorFillElementFunc<ViewL> Func;
  typedef typename Func::Params Params;

  Params params(lhs, value, offset);

  TensorForEach<Func, ViewL::kRank, Params>(size, params);
}

/// Fills a tensor
template <typename ViewL>
void TensorFillElement(
  ViewL lhs,
  typename ViewL::Storage value,
  typename ViewL::TensorCoord const &offset = typename ViewL::TensorCoord()) {

  TensorFillElement(lhs, value, offset, lhs.size() - offset);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace reference
} // namespace cutlass
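
A hedged end-to-end sketch of the initialization and comparison entry points above; the HostTensor device_view() accessor and the Distribution set_uniform() helper are assumed from tools/util and are not shown in this commit:

#include "tools/util/host_tensor.h"
#include "tools/util/reference/device/tensor_elementwise.h"

void example() {
  // Two 128 x 64 tensors with host and device allocations (assumed HostTensor API)
  cutlass::HostTensor<float, 2> A(cutlass::make_Coord(128, 64));
  cutlass::HostTensor<float, 2> B(cutlass::make_Coord(128, 64));

  cutlass::Distribution dist;
  dist.set_uniform(-4, 4);  // assumed helper setting dist.kind and uniform bounds

  // The same seed and distribution yield bitwise-identical tensors, so the
  // device-side comparison should report equality.
  cutlass::reference::device::TensorInitialize(A.device_view(), 2018, dist);
  cutlass::reference::device::TensorInitialize(B.device_view(), 2018, dist);

  bool same = cutlass::reference::device::TensorEquals(A.device_view(), B.device_view());
  (void)same;
}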

tools/util/reference/device/tensor_foreach.h (new file, 72 lines)
@@ -0,0 +1,72 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include <stdexcept>
#include "cutlass/cutlass.h"
#include "tools/util/reference/device/kernel/tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace device {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Launches a kernel for each element in a tensor's index space.
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }

      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace reference
} // namespace cutlass
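
The functor contract TensorForEach expects, shown with a hypothetical element-scaling functor; ScaleFunc, its Params layout, and the row-major indexing are illustrative assumptions, not part of this commit:

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
#include "tools/util/reference/device/tensor_foreach.h"

// Hypothetical functor: device-constructible from a Params object, with
// operator() invoked once per coordinate in the tensor's index space.
struct ScaleFunc {
  struct Params {
    float *data;   // device pointer
    float factor;  // scale applied to each element
    int cols;      // row-major leading dimension (assumption for this example)
  };

  Params params;

  CUTLASS_DEVICE
  ScaleFunc(Params const &params): params(params) { }

  CUTLASS_DEVICE
  void operator()(cutlass::Coord<2> const &coord) {
    params.data[coord[0] * params.cols + coord[1]] *= params.factor;
  }
};

void scale(float *d_data, float factor) {
  // Grid and block sizes are left at zero, so the constructor chooses them
  // via the occupancy query shown above.
  ScaleFunc::Params params = { d_data, factor, 64 };
  cutlass::reference::device::TensorForEach<ScaleFunc, 2, ScaleFunc::Params>(
      cutlass::make_Coord(128, 64), params);
}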

tools/util/reference/host/gemm.h (new file, 270 lines)
@@ -0,0 +1,270 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Reference implementation for GEMM in host-side code.
*/

#pragma once

#include "cutlass/coord.h"
#include "cutlass/matrix_traits.h"
#include "cutlass/tensor_view.h"
#include "cutlass/gemm/gemm_coord.h"

namespace cutlass {
namespace reference {
namespace host {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Template function to compute an inner product.
template <typename Atype, typename Btype, typename Ctype>
Ctype inner_product(Atype a, Btype b, Ctype c) {
  return Ctype(a) * Ctype(b) + c;
}

/// Specialization for matrix multiplication with binary operands
template <>
inline int inner_product<Vector<bin1_t, 32>, Vector<bin1_t, 32>, int>(
    Vector<bin1_t, 32> a,
    Vector<bin1_t, 32> b,
    int c) {

  int accum = 0;
  for (int bit = 0; bit < 32; bit++) {
    accum += a[bit] ^ b[bit];
  }
  return accum + c;
}

/// Specialization for matrix multiplication with signed 4-bit integer operands
template <> inline
int inner_product<Vector<int4_t, 8>, Vector<int4_t, 8>, int>(
    Vector<int4_t, 8> a,
    Vector<int4_t, 8> b,
    int c) {

  int accum = 0;
  for (int k = 0; k < 8; k++) {
    accum += a[k] * b[k];
  }
  return accum + c;
}

/// Specialization for matrix multiplication with unsigned 4-bit integer operands
template <> inline
int inner_product<Vector<uint4_t, 8>, Vector<uint4_t, 8>, int>(
    Vector<uint4_t, 8> a,
    Vector<uint4_t, 8> b,
    int c) {

  int accum = 0;
  for (int k = 0; k < 8; k++) {
    accum += a[k] * b[k];
  }
  return accum + c;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

template <typename SrcType, typename DstType>
struct Cast {
  // Default behavior: convert to the destination type
  static inline DstType apply(SrcType src) { return static_cast<DstType>(src); }
};

template <>
struct Cast<float, int8_t> {
  static inline int8_t apply(float src) {
    // Clamp to the range of signed 8-bit integers.
    return static_cast<int8_t>(fmaxf(-128.f, fminf(127.f, src)));
  }
};

template <>
struct Cast<float, uint8_t> {
  static inline uint8_t apply(float src) {
    // Clamp to the range of unsigned 8-bit integers.
    return static_cast<uint8_t>(fmaxf(0.f, fminf(255.f, src)));
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// Explicitly naming types needed by this template can be cumbersome, particularly for the
/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
/// AccumulatorType(0) as the last function argument can be easier than naming all template
/// arguments explicitly.
template <
  typename TensorRefA,
  typename TensorRefB,
  typename TensorRefC,
  typename ScalarType,
  typename AccumulatorType
>
void Gemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefA tensor_a,
  TensorRefB tensor_b,
  ScalarType beta,
  TensorRefC tensor_c,
  AccumulatorType initial_accum) {

  typedef typename TensorRefA::Storage AType;
  typedef typename TensorRefB::Storage BType;
  typedef typename TensorRefC::Storage CType;

  static_assert(
    TensorRefA::kRank == 2 &&
    TensorRefB::kRank == 2 &&
    TensorRefC::kRank == 2, "Tensors must be of rank 2");

  // Note: batch is ignored.
  int const M = problem_size.m();
  int const N = problem_size.n();
  int const K = problem_size.k();

  // Blocking necessary to speedup reference implementation
  int const Mblock = 32;
  int const Nblock = 32;

  for (int row_block = 0; row_block < M; row_block += Mblock) {
    for (int col_block = 0; col_block < N; col_block += Nblock) {
      AccumulatorType accum[Mblock][Nblock];

      for (int j = 0; j < Nblock; j++) {
        for (int i = 0; i < Mblock; i++) {
          accum[i][j] = initial_accum;
        }
      }

      for (int k_block = 0; k_block < K; ++k_block) {
        for (int j = 0; j < Nblock; j++) {
          for (int i = 0; i < Mblock; i++) {
            int row = row_block + i;
            int col = col_block + j;

            if (row < M && col < N) {
              AType a = tensor_a.at(MatrixCoord(row, k_block));
              BType b = tensor_b.at(MatrixCoord(k_block, col));

              accum[i][j] = detail::inner_product(a, b, accum[i][j]);
            }
          }
        }
      }

      for (int j = 0; j < Nblock; j++) {
        for (int i = 0; i < Mblock; i++) {
          int row = row_block + i;
          int col = col_block + j;

          MatrixCoord coord = MatrixCoord(row, col);
          if (row < M && col < N) {

            tensor_c.at(coord) = detail::Cast<ScalarType, CType>::apply(
              alpha * ScalarType(accum[i][j]) +
              beta * ScalarType(tensor_c.at(coord)));
          }
        }
      }
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// This assumes the accumulator type is the same type as the scalars.
template <
  typename TensorRefA,
  typename TensorRefB,
  typename TensorRefC,
  typename ScalarType
>
void Gemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefA tensor_a,
  TensorRefB tensor_b,
  ScalarType beta,
  TensorRefC tensor_c) {

  Gemm(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0));
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes a batch of GEMMs over a set of matrices of common dimension.
template <
  typename TensorRefCollectionA,
  typename TensorRefCollectionB,
  typename TensorRefCollectionC,
  typename ScalarType,
  typename AccumulatorType
>
void BatchGemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefCollectionA const& tensor_a,
  TensorRefCollectionB const& tensor_b,
  ScalarType beta,
  TensorRefCollectionC &tensor_c,
  AccumulatorType initial_accum = AccumulatorType(0)) {

  typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin();
  typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin();
  typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin();

  for (int batch = 0;
    batch < problem_size.batch();
    ++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) {

    Gemm(
      problem_size,
      alpha,
      *tensor_a_it,
      *tensor_b_it,
      beta,
      *tensor_c_it,
      initial_accum);
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace host
} // namespace reference
} // namespace cutlass
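
The binary-operand specialization of inner_product in this file replaces multiplication with XOR, so the dot product counts mismatched bit positions. A scalar equivalent over a packed 32-bit word, for illustration only:

#include <cstdint>

// Equivalent of the Vector<bin1_t, 32> specialization with both operands
// packed into one 32-bit word: XOR marks the differing bits, and the loop
// accumulates their count (a popcount) into the running total c.
inline int xor_popcount_inner_product(uint32_t a, uint32_t b, int c) {
  uint32_t x = a ^ b;
  int accum = 0;
  for (int bit = 0; bit < 32; ++bit) {
    accum += int((x >> bit) & 1);
  }
  return accum + c;
}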

tools/util/reference/host/tensor_elementwise.h (new file, 478 lines)
@@ -0,0 +1,478 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/* \file
|
||||
\brief Defines host-side elementwise operations on TensorView.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Standard Library includes
|
||||
#include <fstream>
|
||||
#include <ostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
|
||||
// Cutlass includes
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "tools/util/distribution.h"
|
||||
#include "tools/util/type_traits.h"
|
||||
#include "tools/util/reference/host/tensor_foreach.h"
|
||||
|
||||
namespace cutlass {
|
||||
namespace reference {
|
||||
namespace host {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Computes a random uniform distribution
|
||||
template <typename View_>
|
||||
struct RandomUniformFunc {
|
||||
|
||||
/// View type
|
||||
typedef View_ View;
|
||||
|
||||
/// Scalar type
|
||||
typedef typename View::Storage T;
|
||||
|
||||
/// Coordinate in tensor's index space
|
||||
typedef typename View::TensorCoord TensorCoord;
|
||||
|
||||
/// Parameters structure
|
||||
struct Params {
|
||||
|
||||
/// View object
|
||||
View view;
|
||||
|
||||
/// RNG seed
|
||||
unsigned seed;
|
||||
|
||||
/// Distriubtion
|
||||
Distribution dist;
|
||||
|
||||
/// Default ctor
|
||||
Params() { }
|
||||
|
||||
/// Constructor
|
||||
Params(
|
||||
View const &view,
|
||||
unsigned seed,
|
||||
Distribution dist
|
||||
): view(view), seed(seed), dist(dist) { }
|
||||
};
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Parameters object
|
||||
Params params;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Device-side initialization of RNG
|
||||
RandomUniformFunc(Params const ¶ms): params(params) {
|
||||
std::srand(params.seed);
|
||||
}
|
||||
|
||||
/// Compute random value and update RNG state
|
||||
void operator()(TensorCoord const &coord) {
|
||||
|
||||
double range = params.dist.uniform.max - params.dist.uniform.min;
|
||||
|
||||
double rnd = double(std::rand()) / double(RAND_MAX);
|
||||
|
||||
rnd = params.dist.uniform.min + range * rnd;
|
||||
|
||||
// Random values are cast to integer after scaling by a power of two to facilitate error
|
||||
// testing
|
||||
T result;
|
||||
if (params.dist.int_scale >= 0) {
|
||||
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
|
||||
result = T(rnd / double(1 << params.dist.int_scale));
|
||||
}
|
||||
else {
|
||||
result = T(rnd);
|
||||
}
|
||||
|
||||
params.view.at(coord) = result;
|
||||
}
|
||||
};
|
||||
|
||||
/// Computes a random Gaussian distribution
|
||||
template <typename View_>
|
||||
struct RandomGaussianFunc {
|
||||
|
||||
/// View type
|
||||
typedef View_ View;
|
||||
|
||||
/// Scalar type
|
||||
typedef typename View::Storage T;
|
||||
|
||||
/// Coordinate in tensor's index space
|
||||
typedef typename View::TensorCoord TensorCoord;
|
||||
|
||||
/// Parameters structure
|
||||
struct Params {
|
||||
|
||||
/// View object
|
||||
View view;
|
||||
|
||||
/// RNG seed
|
||||
unsigned seed;
|
||||
|
||||
/// RNG distribution
|
||||
Distribution dist;
|
||||
|
||||
/// Default ctor
|
||||
Params() { }
|
||||
|
||||
/// Constructor
|
||||
Params(
|
||||
View const &view,
|
||||
unsigned seed,
|
||||
Distribution dist
|
||||
): view(view), seed(seed), dist(dist) { }
|
||||
};
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Parameters object
|
||||
Params params;
|
||||
|
||||
/// Constant PI
|
||||
double pi;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Device-side initialization of RNG
|
||||
RandomGaussianFunc(Params const ¶ms): params(params) {
|
||||
pi = std::acos(-1);
|
||||
}
|
||||
|
||||
/// Compute random value and update RNG state
|
||||
void operator()(TensorCoord const &coord) {
|
||||
|
||||
// Box-Muller transform to generate random numbers with Normal distribution
|
||||
double u1 = double(std::rand()) / double(RAND_MAX);
|
||||
double u2 = double(std::rand()) / double(RAND_MAX);
|
||||
|
||||
double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2);
|
||||
|
||||
// Scale according to Gaussian distribution parameters
|
||||
rnd = params.dist.gaussian.mean + params.dist.gaussian.stddev * rnd;
|
||||
|
||||
T result;
|
||||
if (params.dist.int_scale >= 0) {
|
||||
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
|
||||
result = T(rnd / double(1 << params.dist.int_scale));
|
||||
}
|
||||
else {
|
||||
result = T(rnd);
|
||||
}
|
||||
|
||||
params.view.at(coord) = result;
|
||||
}
|
||||
};
|
||||
|
||||
/// Computes a linear combination of each element
|
||||
template <typename View_>
|
||||
struct LinearCombinationFunc {
|
||||
|
||||
/// View type
|
||||
typedef View_ View;
|
||||
|
||||
/// Scalar type
|
||||
typedef typename View::Storage T;
|
||||
|
||||
/// Coordinate in tensor's index space
|
||||
typedef typename View::TensorCoord TensorCoord;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// TensorView object
|
||||
View view;
|
||||
|
||||
/// Delta
|
||||
Coord<View::kRank, double> delta;
|
||||
|
||||
/// Offset
|
||||
double offset;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Constructor
|
||||
LinearCombinationFunc(
|
||||
View const &view,
|
||||
Distribution dist
|
||||
): view(view) {
|
||||
|
||||
offset = dist.linear.offset;
|
||||
if (View::kRank >= 1) {
|
||||
delta[View::kRank - 1] = dist.linear.delta_column;
|
||||
}
|
||||
if (View::kRank >= 2) {
|
||||
delta[View::kRank - 2] = dist.linear.delta_row;
|
||||
}
|
||||
// Additional ranks have delta of zero
|
||||
for (int i = View::kRank - 2; i > 0; --i) {
|
||||
delta[i - 1] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute linear combination
|
||||
void operator()(TensorCoord const &coord) {
|
||||
double result = offset;
|
||||
|
||||
for (int i = 0; i < View::kRank; ++i) {
|
||||
result += delta[i] * double(coord[i]);
|
||||
}
|
||||
view.at(coord) = T(result);
|
||||
}
|
||||
};

/// Writes 1 along the tensor's generalized diagonal and 0 elsewhere
template <typename View_>
struct IdentityFunc {

  /// TensorView
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View object
  View view;

  /// Constructor
  IdentityFunc(View const &view): view(view) { }

  /// Writes 1 if all indices of the coordinate are equal, 0 otherwise
  void operator()(TensorCoord const &coord) {
    bool equal = true;
    for (int i = 0; i < View::kRank; ++i) {
      if (coord[i] != coord[0]) {
        equal = false;
      }
    }
    view.at(coord) = equal ? T(1) : T(0);
  }
};
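
// For example (illustrative): in a rank-3 tensor, coordinate (2, 2, 2) receives
// T(1) while (0, 1, 0) receives T(0). For rank-2 views this produces the
// conventional identity matrix.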

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Initializes a tensor randomly or procedurally.
template <typename View>
void TensorInitialize(View const &view,
                      unsigned seed,
                      Distribution const &dist) {

  typedef typename View::Storage Scalar;

  switch (dist.kind) {
    case Distribution::Uniform:
      {
        typedef detail::RandomUniformFunc<View> Func;
        typedef typename Func::Params Params;

        TensorForEach<Func, View::kRank, Params>(
          view.size(),
          Params(view, seed, dist)
        );
      }
      break;
    case Distribution::Gaussian:
      {
        typedef detail::RandomGaussianFunc<View> Func;
        typedef typename Func::Params Params;

        TensorForEach<Func, View::kRank, Params>(
          view.size(),
          Params(view, seed, dist)
        );
      }
      break;
    case Distribution::Linear:
      {
        typedef detail::LinearCombinationFunc<View> Func;

        TensorForEach<Func, View::kRank, Func>(
          view.size(),
          Func(view, dist));
      }
      break;
    case Distribution::Identity:
      {
        typedef detail::IdentityFunc<View> Func;

        Func func(view);

        TensorForEach<Func, View::kRank, Func>(view.size(), func);
      }
      break;
    default:
      break;
  }
}
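
// Usage sketch (illustrative only; 'view' stands for any TensorView-like object
// satisfying the View concept used above, and Distribution's members are assumed
// to be directly assignable, as the accesses above suggest):
//
//   Distribution dist;
//   dist.kind = Distribution::Gaussian;
//   dist.gaussian.mean = 0.0;
//   dist.gaussian.stddev = 4.0;
//   dist.int_scale = 2;              // snap samples to multiples of 0.25
//
//   cutlass::reference::host::TensorInitialize(view, /*seed=*/2018, dist);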

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Compares two tensor views of equal rank and dimension.
template <typename ViewL, typename ViewR>
struct TensorEqualsFunc {

  /// Storage type
  typedef typename ViewL::Storage T;

  /// Unsigned integer type of the same size as the storage type
  typedef typename cutlass::TypeTraits<T>::unsigned_type UnsignedType;

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Assertions
  static_assert(ViewL::kRank == ViewR::kRank,
    "Cannot compare tensors of different rank");

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Pointer to result scalar - written with 0 only if a mismatch is found
  int *result;

  //
  // Methods
  //

  /// Constructor
  TensorEqualsFunc(
    ViewL const &lhs,
    ViewR const &rhs,
    int *result
  ): lhs(lhs), rhs(rhs), result(result) { }

  /// Bitwise equality check of the storage representations
  void operator()(TensorCoord const &coord) {
    UnsignedType _lhs = reinterpret_cast<UnsignedType const &>(lhs.at(coord));
    UnsignedType _rhs = reinterpret_cast<UnsignedType const &>(rhs.at(coord));
    if (_lhs != _rhs) {
      *result = 0;
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Returns true if two tensor views are bitwise equal.
template <typename ViewL, typename ViewR>
bool TensorEquals(ViewL const &lhs, ViewR const &rhs) {

  // Sizes must be identical
  if (lhs.size() != rhs.size()) {
    return false;
  }

  int result = 1;

  typedef detail::TensorEqualsFunc<ViewL, ViewR> Func;
  Func func(lhs, rhs, &result);

  TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);

  return result != 0;
}
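
// Usage sketch (illustrative; 'view_a' and 'view_b' are hypothetical views of
// identical rank and size):
//
//   if (!cutlass::reference::host::TensorEquals(view_a, view_b)) {
//     std::cerr << "Mismatch between reference and computed tensors\n";
//   }
//
// Because the comparison is bitwise, floating-point results should be produced
// through a quantizing distribution (see int_scale above) or identical code paths.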

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Helper to apply a binary operator in place
template <typename ViewL, typename ViewR, typename BinaryFunc>
struct TensorFuncBinaryOp {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Binary function applied to each element
  BinaryFunc func;

  //
  // Methods
  //

  /// Constructor
  TensorFuncBinaryOp(
    ViewL const &lhs,
    ViewR const &rhs,
    BinaryFunc func = BinaryFunc()): lhs(lhs), rhs(rhs), func(func) { }

  /// Applies the binary operator, overwriting the left-hand element
  void operator()(TensorCoord const &coord) {
    lhs.at(coord) = func(lhs.at(coord), rhs.at(coord));
  }
};
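
// Usage sketch (illustrative; 'ViewA'/'ViewB' and 'view_a'/'view_b' are
// hypothetical view types and instances of identical size, and <functional>
// is assumed to be included). Computes view_a += view_b elementwise by driving
// the functor with TensorForEach:
//
//   typedef cutlass::reference::host::TensorFuncBinaryOp<
//     ViewA, ViewB, std::plus<float> > TensorAdd;
//
//   cutlass::reference::host::TensorForEach<TensorAdd, ViewA::kRank, TensorAdd>(
//     view_a.size(),
//     TensorAdd(view_a, view_b));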

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace host
} // namespace reference
} // namespace cutlass
102
tools/util/reference/host/tensor_foreach.h
Normal file
@ -0,0 +1,102 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <stdexcept>
#include "cutlass/cutlass.h"
#include "tools/util/reference/device/kernel/tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace host {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines several helpers
namespace detail {

/// Helper to perform for-each operation
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {

  /// Index of the active rank
  static int const kActiveRank = Rank - RankRemaining - 1;

  /// Constructor for general rank - iterates over this rank and recurses into the next
  TensorForEachHelper(
    Func &func,
    Coord<Rank> const &size,
    Coord<Rank> &coord) {

    for (int i = 0; i < size.at(kActiveRank); ++i) {
      coord[kActiveRank] = i;
      TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, size, coord);
    }
  }
};

/// Partial specialization terminating the recursion
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {

  /// Index of the active rank
  static int const kActiveRank = Rank - 1;

  /// Constructor for the fastest changing rank - invokes the functor at each coordinate
  TensorForEachHelper(
    Func &func,
    Coord<Rank> const &size,
    Coord<Rank> &coord) {

    for (int i = 0; i < size.at(kActiveRank); ++i) {
      coord[kActiveRank] = i;
      func(coord);
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Iterates over the index space of a tensor
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  TensorForEach(Coord<Rank> size, Params params = Params()) {

    Func func(params);
    Coord<Rank> coord;

    detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord);
  }
};
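
// Illustrative note (not from the original source): the recursion makes rank
// (Rank - 1) the fastest-changing index. A hypothetical functor printing every
// coordinate of a 2x3 index space (assumes <cstdio> and cutlass::make_Coord):
//
//   struct PrintCoord {
//     PrintCoord(int) { }                        // Params is unused here
//     void operator()(Coord<2> const &coord) {
//       std::printf("(%d, %d)\n", coord[0], coord[1]);
//     }
//   };
//
//   TensorForEach<PrintCoord, 2, int>(make_Coord(2, 3));
//
// prints (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2) in that order.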

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace host
} // namespace reference
} // namespace cutlass