Checkpointing CUTLASS 1.1 release.

tools/util/reference/device/kernel/tensor_elementwise.h (new file, 162 lines)
@@ -0,0 +1,162 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <curand_kernel.h>

#include "cutlass/cutlass.h"

namespace cutlass {
namespace reference {
namespace device {
namespace kernel {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to uniform random distribution
template <typename T>
__global__ void TensorInitializeUniform(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      double range = dist.uniform.max - dist.uniform.min;

      double rnd = curand_uniform(&rng_state[threadIdx.x]);

      rnd = dist.uniform.min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      tensor += ldm;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to Gaussian random distribution
template <typename T>
__global__ void TensorInitializeGaussian(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing

      double rnd = curand_normal(&rng_state[threadIdx.x]);

      rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;

      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      // Advance to the next strided element (mirrors TensorInitializeUniform)
      tensor += ldm;
    }
  }
}

/// Kernel to initialize tensor to a linear combination of its coordinates
template <typename T>
__global__ void TensorInitializeLinear(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      *tensor =
          dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;

      tensor += ldm;
    }
  }
}

/// Kernel to initialize tensor to an identity matrix
template <typename T>
__global__ void TensorInitializeIdentity(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      *tensor = (c_idx == s_idx ? T(1) : T(0));

      tensor += ldm;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace device
} // namespace reference
} // namespace cutlass
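
For illustration, a minimal launch sketch for the kernels above, assuming a column-major M x N matrix whose contiguous dimension is M and strided dimension is N. The wrapper function and the 256-thread block size are assumptions, not part of this commit; blockDim.x must not exceed 1024, the extent of the shared rng_state array, and each blockIdx.y step covers blockDim.x strided indices, matching the kernel's s_offset loop.

#include <cuda_runtime.h>
#include "tools/util/reference/device/kernel/tensor_elementwise.h"

// Hypothetical wrapper (illustrative only): fills a column-major M x N matrix
// with leading dimension M.
void initialize_uniform(float *d_tensor, int M, int N, int64_t seed,
                        cutlass::Distribution const &dist) {
  int const block = 256;               // must be <= 1024 (size of rng_state)
  dim3 grid((M + block - 1) / block,   // covers the contiguous dimension
            (N + block - 1) / block);  // each y-block covers `block` strided indices
  cutlass::reference::device::kernel::TensorInitializeUniform<float>
      <<<grid, dim3(block, 1, 1)>>>(dist, seed, M, N, d_tensor, M);
}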

tools/util/reference/device/kernel/tensor_foreach.h (new file, 112 lines)
@@ -0,0 +1,112 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"

namespace cutlass {
namespace reference {
namespace device {
namespace kernel {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines several helpers
namespace detail {

/// Helper to perform for-each operation
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {

  /// Constructor for general rank
  __inline__ __device__
  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {

    int64_t product = 1;

    CUTLASS_PRAGMA_UNROLL
    for (int i = Rank - RankRemaining; i < Rank; ++i) {
      product *= size[i];
    }

    coord[Rank - 1 - RankRemaining] = index / product;
    int64_t remaining = index % product;

    TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, size, coord, remaining);
  }
};

/// Helper to perform for-each operation
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {

  /// Constructor for fastest changing rank
  __inline__ __device__
  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {

    coord[Rank - 1] = index;

    if (coord < size) {
      func(coord);
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Helper to perform for-each operation
template <typename Func, int Rank, typename Params>
__global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {

  Func func(params);

  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
  int64_t max_index = 1;

  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < Rank; ++i) {
    max_index *= size[i];
  }

  CUTLASS_PRAGMA_NO_UNROLL
  while (index < max_index) {
    Coord<Rank> coord;

    detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index);
    index += blockDim.x * gridDim.x;
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace device
} // namespace reference
} // namespace cutlass
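
The recursive helper above delinearizes a flat index into a rank-N coordinate, slowest-varying dimension first: each step divides by the product of the extents to its right, then recurses on the remainder. An equivalent host-side loop, shown only to make the index arithmetic explicit:

#include <cstdint>

// Equivalent of TensorForEachHelper's recursion: peel off one coordinate per
// step; the last coordinate is the fastest-changing rank.
template <int Rank>
void delinearize(int64_t index, int const (&size)[Rank], int (&coord)[Rank]) {
  for (int r = 0; r < Rank - 1; ++r) {
    int64_t product = 1;
    for (int i = r + 1; i < Rank; ++i) {  // product of extents to the right of r
      product *= size[i];
    }
    coord[r] = int(index / product);
    index %= product;
  }
  coord[Rank - 1] = int(index);
}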

tools/util/reference/device/tensor_elementwise.h (new file, 772 lines)
@@ -0,0 +1,772 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
    in this header are not specialized for any particular data layout and are therefore not
    intended to offer the best possible performance. Rather, they are intended to be generic
    reference implementations to support the CUTLASS unit tests.
*/

#pragma once

// Standard Library includes
#include <fstream>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>

// CUDA includes
#include <cublas_v2.h>
#include <curand_kernel.h>

// Cutlass includes
#include "cutlass/cutlass.h"
#include "tools/util/device_memory.h"
#include "tools/util/distribution.h"
#include "tools/util/type_traits.h"
#include "tools/util/host_tensor.h"
#include "tools/util/reference/device/tensor_foreach.h"

namespace cutlass {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace reference {
namespace device {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Computes a random uniform distribution
template <typename View_>
struct RandomUniformFunc {

  /// View type
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    /// View object
    View view;

    /// RNG seed
    int64_t seed;

    /// Distribution
    Distribution dist;

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    /// Constructor
    CUTLASS_HOST_DEVICE
    Params(
      View const &view,
      int64_t seed,
      Distribution dist
    ): view(view), seed(seed), dist(dist) { }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  /// RNG state object
  curandState_t rng_state;

  //
  // Methods
  //

  /// Device-side initialization of RNG
  CUTLASS_DEVICE
  RandomUniformFunc(Params const &params): params(params) {

    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;

    curand_init(params.seed, gtid, 0, &rng_state);
  }

  /// Compute random value and update RNG state
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    double range = params.dist.uniform.max - params.dist.uniform.min;
    double rnd = curand_uniform(&rng_state);
    rnd = params.dist.uniform.min + range * rnd;

    // Random values are cast to integer after scaling by a power of two to facilitate error
    // testing
    T result;
    if (params.dist.int_scale >= 0) {
      rnd = double(int(rnd * double(1 << params.dist.int_scale)));
      result = T(rnd / double(1 << params.dist.int_scale));
    }
    else {
      result = T(rnd);
    }

    params.view.at(coord) = result;
  }
};

/// Computes a random Gaussian distribution
template <typename View_>
struct RandomGaussianFunc {

  /// View type
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    /// View object
    View view;

    /// RNG seed
    int64_t seed;

    /// RNG distribution
    Distribution dist;

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    /// Constructor
    CUTLASS_HOST_DEVICE
    Params(
      View const &view,
      int64_t seed,
      Distribution dist
    ): view(view), seed(seed), dist(dist) { }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  /// RNG state object
  curandState_t rng_state;

  //
  // Methods
  //

  /// Device-side initialization of RNG
  CUTLASS_DEVICE
  RandomGaussianFunc(Params const &params): params(params) {

    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;

    curand_init(params.seed, gtid, 0, &rng_state);
  }

  /// Compute random value and update RNG state
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    double rnd = curand_normal(&rng_state);
    rnd = params.dist.gaussian.mean + params.dist.gaussian.stddev * rnd;

    T result;
    if (params.dist.int_scale >= 0) {
      rnd = double(int(rnd * double(1 << params.dist.int_scale)));
      result = T(rnd / double(1 << params.dist.int_scale));
    }
    else {
      result = T(rnd);
    }

    params.view.at(coord) = result;
  }
};

/// Computes a linear combination of each element
template <typename View_>
struct LinearCombinationFunc {

  /// View type
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// TensorView object
  View view;

  /// Delta
  Coord<View::kRank, double> delta;

  /// Offset
  double offset;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  LinearCombinationFunc(
    View const &view,
    Distribution dist
  ): view(view) {

    offset = dist.linear.offset;
    if (View::kRank >= 1) {
      delta[View::kRank - 1] = dist.linear.delta_column;
    }
    if (View::kRank >= 2) {
      delta[View::kRank - 2] = dist.linear.delta_row;
    }
    // Additional ranks have delta of zero
    for (int i = View::kRank - 2; i > 0; --i) {
      delta[i - 1] = 0;
    }
  }

  /// Compute linear combination
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    double result = offset;
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < View::kRank; ++i) {
      result += delta[i] * double(coord[i]);
    }
    view.at(coord) = T(result);
  }
};

/// Returns 1 or 0 if the coordinate is along the tensor's diagonal
template <typename View_>
struct IdentityFunc {

  /// TensorView
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View object
  View view;

  /// Default ctor
  CUTLASS_HOST_DEVICE
  IdentityFunc(View const &view): view(view) { }

  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    bool equal = true;
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < View::kRank; ++i) {
      if (coord[i] != coord[0]) {
        equal = false;
      }
    }
    view.at(coord) = equal ? T(1) : T(0);
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Initializes a tensor randomly or procedurally.
template <typename View>
void TensorInitialize(View const &view,
                      int64_t seed,
                      Distribution const &dist) {

  typedef typename View::Storage Scalar;

  switch (dist.kind) {
    case Distribution::Uniform:
    {
      typedef detail::RandomUniformFunc<View> Func;
      typedef typename Func::Params Params;

      TensorForEach<Func, View::kRank, Params>(
        view.size(),
        Params(view, seed, dist)
      );
    }
    break;
    case Distribution::Gaussian:
    {
      typedef detail::RandomGaussianFunc<View> Func;
      typedef typename Func::Params Params;

      TensorForEach<Func, View::kRank, Params>(
        view.size(),
        Params(view, seed, dist)
      );
    }
    break;
    case Distribution::Linear:
    {
      typedef detail::LinearCombinationFunc<View> Func;
      TensorForEach<Func, View::kRank, Func>(
        view.size(),
        Func(view, dist));
    }
    break;
    case Distribution::Identity:
    {
      typedef detail::IdentityFunc<View> Func;

      Func func(view);

      TensorForEach<Func, View::kRank, Func>(view.size(), func);
    }
    break;
    default:
      break;
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace reference

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Dispatcher to appropriate initialization kernel - preserved for backwards compatibility
template <typename T>
inline void tensor_initialize(Distribution const &dist,
                              int64_t seed,
                              int dim_contiguous,
                              int dim_strided,
                              T *tensor,
                              int ldm) {

  TensorView<T, 2> view(tensor, make_Coord(ldm, 1), make_Coord(dim_strided, dim_contiguous));
  reference::device::TensorInitialize(view, seed, dist);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace reference {
namespace device {
namespace detail {

/// Compares two tensor views of equal rank and dimension.
template <typename ViewL, typename ViewR>
struct TensorEqualsFunc {

  /// Storage type
  typedef typename ViewL::Storage T;

  /// Unsigned integer type of same size as View type
  typedef typename cutlass::TypeTraits<T>::unsigned_type UnsignedType;

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Assertions
  static_assert(ViewL::kRank == ViewR::kRank,
    "Cannot compare tensors of different rank");

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Pointer to result scalar - only written with 0 if values are incorrect
  int *result;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorEqualsFunc(ViewL const &lhs, ViewR const &rhs, int *result): lhs(lhs), rhs(rhs), result(result) { }

  /// Equality check
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    UnsignedType _lhs = reinterpret_cast<UnsignedType const &>(lhs.at(coord));
    UnsignedType _rhs = reinterpret_cast<UnsignedType const &>(rhs.at(coord));
    if (_lhs != _rhs) {
      *result = 0;
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Returns true if two tensor views are equal.
template <typename ViewL, typename ViewR>
bool TensorEquals(ViewL const &lhs, ViewR const &rhs) {

  // Sizes must be identical
  if (lhs.size() != rhs.size()) {
    return false;
  }

  // Allocate device memory to contain result of kernel reduction
  HostTensor<int, 1> result(1);
  result.fill(1);
  result.sync_device();

  typedef detail::TensorEqualsFunc<ViewL, ViewR> Func;
  Func func(lhs, rhs, result.device_data());

  TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);
  result.sync_host();

  return result.at(0) != 0;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Helper to apply a binary operator in place
template <typename ViewL, typename ViewR, typename BinaryFunc>
struct TensorFuncBinaryOp {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Binary function applied to each element
  BinaryFunc func;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorFuncBinaryOp(
    ViewL const &lhs,
    ViewR const &rhs,
    BinaryFunc func = BinaryFunc()): lhs(lhs), rhs(rhs), func(func) { }

  /// Applies the binary function in place
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {
    lhs.at(coord) = func(lhs.at(coord), rhs.at(coord));
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Helper to copy elements from one tensor view into another
template <typename ViewL, typename ViewR>
struct TensorFillFunc {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Destination element type
  typedef typename ViewL::Storage DestType;

  /// Source element type
  typedef typename ViewR::Storage SrcType;

  /// Parameters object
  struct Params {

    //
    // Data members
    //

    /// View of left-hand-side tensor
    ViewL lhs;

    /// View of right-hand-side tensor
    ViewR rhs;

    /// Source offset coordinate
    TensorCoord source_offset;

    /// Size of the subtensor copied from the source
    TensorCoord source_size;

    /// Offset in destination
    TensorCoord dest_offset;

    //
    // Methods
    //

    /// Constructs a parameters object for filling a tensor
    Params(
      ViewL const &lhs,
      ViewR const &rhs,
      TensorCoord const &source_offset = TensorCoord()
    ):
      lhs(lhs), rhs(rhs), source_offset(source_offset), source_size(rhs.size() - source_offset) { }

    /// Constructs a parameters object for filling a tensor
    Params(
      ViewL const &lhs,
      ViewR const &rhs,
      TensorCoord const &source_offset,
      TensorCoord const &source_size,
      TensorCoord const &dest_offset = TensorCoord()
    ):
      lhs(lhs), rhs(rhs), source_offset(source_offset), source_size(source_size), dest_offset(dest_offset) { }
  };

  //
  // Data members
  //

  Params params;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorFillFunc(
    Params const &params): params(params) { }

  /// Copies a single element
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {

    TensorCoord dst_coord = params.dest_offset + coord;
    TensorCoord src_coord = params.source_offset + coord;

    if (dst_coord < params.lhs.size() && src_coord < params.rhs.size()) {
      params.lhs.at(dst_coord) = DestType(params.rhs.at(src_coord));
    }
  }
};

} // namespace detail

/// Fills a TensorView with the elements from another TensorView
template <typename ViewL, typename ViewR>
void TensorFill(
  ViewL lhs,
  ViewR rhs,
  typename ViewL::TensorCoord const &source_offset,
  typename ViewL::TensorCoord const &source_size,
  typename ViewL::TensorCoord const &dest_offset) {

  typedef typename ViewL::TensorCoord TensorCoord;

  TensorCoord dst_size = lhs.size() - dest_offset;
  TensorCoord src_size = rhs.size() - source_offset;

  TensorCoord fill_size = dst_size.clamp(src_size);

  // Fill function
  typedef detail::TensorFillFunc<ViewL, ViewR> Func;
  typedef typename Func::Params Params;

  Params params(lhs, rhs, source_offset, source_size, dest_offset);

  TensorForEach<Func, ViewL::kRank, Params>(fill_size, params);
}

/// Fills a TensorView with the elements from another TensorView
template <typename ViewL, typename ViewR>
void TensorFill(
  ViewL lhs,
  ViewR rhs,
  typename ViewL::TensorCoord const &source_offset = typename ViewL::TensorCoord()) {

  typedef typename ViewL::TensorCoord TensorCoord;

  TensorFill(lhs, rhs, source_offset, rhs.size(), TensorCoord());
}

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Helper to overwrite a region of a tensor with a single value
template <typename ViewL>
struct TensorFillElementFunc {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Destination element type
  typedef typename ViewL::Storage DestType;

  /// Parameters object
  struct Params {

    //
    // Data members
    //

    /// View of left-hand-side tensor
    ViewL lhs;

    /// Source offset coordinate
    TensorCoord offset;

    /// Element to overwrite with
    DestType value;

    //
    // Methods
    //

    /// Constructs a parameters object for filling a tensor
    CUTLASS_HOST_DEVICE
    Params(
      ViewL const &lhs,
      DestType const &value,
      TensorCoord const &offset = TensorCoord()
    ):
      lhs(lhs), value(value), offset(offset) { }
  };

  //
  // Data members
  //

  Params params;

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  TensorFillElementFunc(
    Params const &params): params(params) { }

  /// Writes the fill value
  CUTLASS_HOST_DEVICE
  void operator()(TensorCoord const &coord) {

    TensorCoord dst_coord = params.offset + coord;

    if (dst_coord < params.lhs.size()) {
      params.lhs.at(dst_coord) = params.value;
    }
  }
};

} // namespace detail

/// Method to perform the actual fill
template <typename ViewL>
void TensorFillElement(
  ViewL const &lhs,
  typename ViewL::Storage const &value,
  typename ViewL::TensorCoord const &offset,
  typename ViewL::TensorCoord const &size) {

  // Fill function
  typedef detail::TensorFillElementFunc<ViewL> Func;
  typedef typename Func::Params Params;

  Params params(lhs, value, offset);

  TensorForEach<Func, ViewL::kRank, Params>(size, params);
}

/// Fills a tensor
template <typename ViewL>
void TensorFillElement(
  ViewL lhs,
  typename ViewL::Storage value,
  typename ViewL::TensorCoord const &offset = typename ViewL::TensorCoord()) {

  TensorFillElement(lhs, value, offset, lhs.size() - offset);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace reference
} // namespace cutlass
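
A hedged end-to-end sketch of the initialization and comparison entry points above; the HostTensor device_view() accessor and the Distribution set_uniform() helper are assumed from tools/util and are not shown in this commit:

#include "tools/util/host_tensor.h"
#include "tools/util/reference/device/tensor_elementwise.h"

void example() {
  // Two 128 x 64 tensors with host and device allocations (assumed HostTensor API)
  cutlass::HostTensor<float, 2> A(cutlass::make_Coord(128, 64));
  cutlass::HostTensor<float, 2> B(cutlass::make_Coord(128, 64));

  cutlass::Distribution dist;
  dist.set_uniform(-4, 4);  // assumed helper setting dist.kind and uniform bounds

  // The same seed and distribution yield bitwise-identical tensors, so the
  // device-side comparison should report equality.
  cutlass::reference::device::TensorInitialize(A.device_view(), 2018, dist);
  cutlass::reference::device::TensorInitialize(B.device_view(), 2018, dist);

  bool same = cutlass::reference::device::TensorEquals(A.device_view(), B.device_view());
  (void)same;
}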

tools/util/reference/device/tensor_foreach.h (new file, 72 lines)
@@ -0,0 +1,72 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include <stdexcept>
#include "cutlass/cutlass.h"
#include "tools/util/reference/device/kernel/tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace device {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Launches a kernel for each element in a tensor's index space.
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }

      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace reference
} // namespace cutlass
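
The functor contract TensorForEach expects, shown with a hypothetical element-scaling functor; ScaleFunc, its Params layout, and the row-major indexing are illustrative assumptions, not part of this commit:

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
#include "tools/util/reference/device/tensor_foreach.h"

// Hypothetical functor: device-constructible from a Params object, with
// operator() invoked once per coordinate in the tensor's index space.
struct ScaleFunc {
  struct Params {
    float *data;   // device pointer
    float factor;  // scale applied to each element
    int cols;      // row-major leading dimension (assumption for this example)
  };

  Params params;

  CUTLASS_DEVICE
  ScaleFunc(Params const &params): params(params) { }

  CUTLASS_DEVICE
  void operator()(cutlass::Coord<2> const &coord) {
    params.data[coord[0] * params.cols + coord[1]] *= params.factor;
  }
};

void scale(float *d_data, float factor) {
  // Grid and block sizes are left at zero, so the constructor chooses them
  // via the occupancy query shown above.
  ScaleFunc::Params params = { d_data, factor, 64 };
  cutlass::reference::device::TensorForEach<ScaleFunc, 2, ScaleFunc::Params>(
      cutlass::make_Coord(128, 64), params);
}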

tools/util/reference/host/gemm.h (new file, 270 lines)
@@ -0,0 +1,270 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Reference implementation for GEMM in host-side code.
*/

#pragma once

#include "cutlass/coord.h"
#include "cutlass/matrix_traits.h"
#include "cutlass/tensor_view.h"
#include "cutlass/gemm/gemm_coord.h"

namespace cutlass {
namespace reference {
namespace host {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Template function to compute an inner product.
template <typename Atype, typename Btype, typename Ctype>
Ctype inner_product(Atype a, Btype b, Ctype c) {
  return Ctype(a) * Ctype(b) + c;
}

/// Specialization for matrix multiplication with binary operands
template <>
inline int inner_product<Vector<bin1_t, 32>, Vector<bin1_t, 32>, int>(
    Vector<bin1_t, 32> a,
    Vector<bin1_t, 32> b,
    int c) {

  int accum = 0;
  for (int bit = 0; bit < 32; bit++) {
    accum += a[bit] ^ b[bit];
  }
  return accum + c;
}

/// Specialization for matrix multiplication with signed 4-bit integer operands
template <> inline
int inner_product<Vector<int4_t, 8>, Vector<int4_t, 8>, int>(
    Vector<int4_t, 8> a,
    Vector<int4_t, 8> b,
    int c) {

  int accum = 0;
  for (int k = 0; k < 8; k++) {
    accum += a[k] * b[k];
  }
  return accum + c;
}

/// Specialization for matrix multiplication with unsigned 4-bit integer operands
template <> inline
int inner_product<Vector<uint4_t, 8>, Vector<uint4_t, 8>, int>(
    Vector<uint4_t, 8> a,
    Vector<uint4_t, 8> b,
    int c) {

  int accum = 0;
  for (int k = 0; k < 8; k++) {
    accum += a[k] * b[k];
  }
  return accum + c;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

template <typename SrcType, typename DstType>
struct Cast {
  // Default behavior: convert to the destination type
  static inline DstType apply(SrcType src) { return static_cast<DstType>(src); }
};

template <>
struct Cast<float, int8_t> {
  static inline int8_t apply(float src) {
    // Clamp to the range of signed 8-bit integers.
    return static_cast<int8_t>(fmaxf(-128.f, fminf(127.f, src)));
  }
};

template <>
struct Cast<float, uint8_t> {
  static inline uint8_t apply(float src) {
    // Clamp to the range of unsigned 8-bit integers.
    return static_cast<uint8_t>(fmaxf(0.f, fminf(255.f, src)));
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// Explicitly naming types needed by this template can be cumbersome, particularly for the
/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
/// AccumulatorType(0) as the last function argument can be easier than naming all template
/// arguments explicitly.
template <
  typename TensorRefA,
  typename TensorRefB,
  typename TensorRefC,
  typename ScalarType,
  typename AccumulatorType
>
void Gemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefA tensor_a,
  TensorRefB tensor_b,
  ScalarType beta,
  TensorRefC tensor_c,
  AccumulatorType initial_accum) {

  typedef typename TensorRefA::Storage AType;
  typedef typename TensorRefB::Storage BType;
  typedef typename TensorRefC::Storage CType;

  static_assert(
    TensorRefA::kRank == 2 &&
    TensorRefB::kRank == 2 &&
    TensorRefC::kRank == 2, "Tensors must be of rank 2");

  // Note: batch is ignored.
  int const M = problem_size.m();
  int const N = problem_size.n();
  int const K = problem_size.k();

  // Blocking necessary to speedup reference implementation
  int const Mblock = 32;
  int const Nblock = 32;

  for (int row_block = 0; row_block < M; row_block += Mblock) {
    for (int col_block = 0; col_block < N; col_block += Nblock) {
      AccumulatorType accum[Mblock][Nblock];

      for (int j = 0; j < Nblock; j++) {
        for (int i = 0; i < Mblock; i++) {
          accum[i][j] = initial_accum;
        }
      }

      for (int k_block = 0; k_block < K; ++k_block) {
        for (int j = 0; j < Nblock; j++) {
          for (int i = 0; i < Mblock; i++) {
            int row = row_block + i;
            int col = col_block + j;

            if (row < M && col < N) {
              AType a = tensor_a.at(MatrixCoord(row, k_block));
              BType b = tensor_b.at(MatrixCoord(k_block, col));

              accum[i][j] = detail::inner_product(a, b, accum[i][j]);
            }
          }
        }
      }

      for (int j = 0; j < Nblock; j++) {
        for (int i = 0; i < Mblock; i++) {
          int row = row_block + i;
          int col = col_block + j;

          MatrixCoord coord = MatrixCoord(row, col);
          if (row < M && col < N) {

            tensor_c.at(coord) = detail::Cast<ScalarType, CType>::apply(
              alpha * ScalarType(accum[i][j]) +
              beta * ScalarType(tensor_c.at(coord)));
          }
        }
      }
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// This assumes the accumulator type is the same type as the scalars.
template <
  typename TensorRefA,
  typename TensorRefB,
  typename TensorRefC,
  typename ScalarType
>
void Gemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefA tensor_a,
  TensorRefB tensor_b,
  ScalarType beta,
  TensorRefC tensor_c) {

  Gemm(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0));
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes a batch of GEMMs over a set of matrices of common dimension.
template <
  typename TensorRefCollectionA,
  typename TensorRefCollectionB,
  typename TensorRefCollectionC,
  typename ScalarType,
  typename AccumulatorType
>
void BatchGemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefCollectionA const& tensor_a,
  TensorRefCollectionB const& tensor_b,
  ScalarType beta,
  TensorRefCollectionC &tensor_c,
  AccumulatorType initial_accum = AccumulatorType(0)) {

  typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin();
  typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin();
  typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin();

  for (int batch = 0;
    batch < problem_size.batch();
    ++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) {

    Gemm(
      problem_size,
      alpha,
      *tensor_a_it,
      *tensor_b_it,
      beta,
      *tensor_c_it,
      initial_accum);
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace host
} // namespace reference
} // namespace cutlass
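
The binary-operand specialization of inner_product in this file replaces multiplication with XOR, so the dot product counts mismatched bit positions. A scalar equivalent over a packed 32-bit word, for illustration only:

#include <cstdint>

// Equivalent of the Vector<bin1_t, 32> specialization with both operands
// packed into one 32-bit word: XOR marks the differing bits, and the loop
// accumulates their count (a popcount) into the running total c.
inline int xor_popcount_inner_product(uint32_t a, uint32_t b, int c) {
  uint32_t x = a ^ b;
  int accum = 0;
  for (int bit = 0; bit < 32; ++bit) {
    accum += int((x >> bit) & 1);
  }
  return accum + c;
}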

tools/util/reference/host/tensor_elementwise.h (new file, 478 lines)
@@ -0,0 +1,478 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/* \file
|
||||
\brief Defines host-side elementwise operations on TensorView.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Standard Library includes
|
||||
#include <fstream>
|
||||
#include <ostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
|
||||
// Cutlass includes
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "tools/util/distribution.h"
|
||||
#include "tools/util/type_traits.h"
|
||||
#include "tools/util/reference/host/tensor_foreach.h"
|
||||
|
||||
namespace cutlass {
|
||||
namespace reference {
|
||||
namespace host {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Computes a random uniform distribution
|
||||
template <typename View_>
|
||||
struct RandomUniformFunc {
|
||||
|
||||
/// View type
|
||||
typedef View_ View;
|
||||
|
||||
/// Scalar type
|
||||
typedef typename View::Storage T;
|
||||
|
||||
/// Coordinate in tensor's index space
|
||||
typedef typename View::TensorCoord TensorCoord;
|
||||
|
||||
/// Parameters structure
|
||||
struct Params {
|
||||
|
||||
/// View object
|
||||
View view;
|
||||
|
||||
/// RNG seed
|
||||
unsigned seed;
|
||||
|
||||
/// Distriubtion
|
||||
Distribution dist;
|
||||
|
||||
/// Default ctor
|
||||
Params() { }
|
||||
|
||||
/// Constructor
|
||||
Params(
|
||||
View const &view,
|
||||
unsigned seed,
|
||||
Distribution dist
|
||||
): view(view), seed(seed), dist(dist) { }
|
||||
};
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Parameters object
|
||||
Params params;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Device-side initialization of RNG
|
||||
RandomUniformFunc(Params const ¶ms): params(params) {
|
||||
std::srand(params.seed);
|
||||
}
|
||||
|
||||
/// Compute random value and update RNG state
|
||||
void operator()(TensorCoord const &coord) {
|
||||
|
||||
double range = params.dist.uniform.max - params.dist.uniform.min;
|
||||
|
||||
double rnd = double(std::rand()) / double(RAND_MAX);
|
||||
|
||||
rnd = params.dist.uniform.min + range * rnd;
|
||||
|
||||
// Random values are cast to integer after scaling by a power of two to facilitate error
|
||||
// testing
|
||||
T result;
|
||||
if (params.dist.int_scale >= 0) {
|
||||
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
|
||||
result = T(rnd / double(1 << params.dist.int_scale));
|
||||
}
|
||||
else {
|
||||
result = T(rnd);
|
||||
}
|
||||
|
||||
params.view.at(coord) = result;
|
||||
}
|
||||
};
|
||||
|
||||
/// Computes a random Gaussian distribution
|
||||
template <typename View_>
|
||||
struct RandomGaussianFunc {
|
||||
|
||||
/// View type
|
||||
typedef View_ View;
|
||||
|
||||
/// Scalar type
|
||||
typedef typename View::Storage T;
|
||||
|
||||
/// Coordinate in tensor's index space
|
||||
typedef typename View::TensorCoord TensorCoord;
|
||||
|
||||
/// Parameters structure
|
||||
struct Params {
|
||||
|
||||
/// View object
|
||||
View view;
|
||||
|
||||
/// RNG seed
|
||||
unsigned seed;
|
||||
|
||||
/// RNG distribution
|
||||
Distribution dist;
|
||||
|
||||
/// Default ctor
|
||||
Params() { }
|
||||
|
||||
/// Constructor
|
||||
Params(
|
||||
View const &view,
|
||||
unsigned seed,
|
||||
Distribution dist
|
||||
): view(view), seed(seed), dist(dist) { }
|
||||
};
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Parameters object
|
||||
Params params;
|
||||
|
||||
/// Constant PI
|
||||
double pi;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Device-side initialization of RNG
|
||||
RandomGaussianFunc(Params const ¶ms): params(params) {
|
||||
pi = std::acos(-1);
|
||||
}
|
||||
|
||||
/// Compute random value and update RNG state
|
||||
void operator()(TensorCoord const &coord) {
|
||||
|
||||
// Box-Muller transform to generate random numbers with Normal distribution
|
||||
double u1 = double(std::rand()) / double(RAND_MAX);
|
||||
double u2 = double(std::rand()) / double(RAND_MAX);
|
||||
|
||||
double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2);
|
||||
|
||||
// Scale according to Gaussian distribution parameters
|
||||
rnd = params.dist.gaussian.mean + params.dist.gaussian.stddev * rnd;
|
||||
|
||||
T result;
|
||||
if (params.dist.int_scale >= 0) {
|
||||
rnd = double(int(rnd * double(1 << params.dist.int_scale)));
|
||||
result = T(rnd / double(1 << params.dist.int_scale));
|
||||
}
|
||||
else {
|
||||
result = T(rnd);
|
||||
}
|
||||
|
||||
params.view.at(coord) = result;
|
||||
}
|
||||
};
|
||||
|
||||
/// Computes a linear combination of each element
|
||||
template <typename View_>
|
||||
struct LinearCombinationFunc {
|
||||
|
||||
/// View type
|
||||
typedef View_ View;
|
||||
|
||||
/// Scalar type
|
||||
typedef typename View::Storage T;
|
||||
|
||||
/// Coordinate in tensor's index space
|
||||
typedef typename View::TensorCoord TensorCoord;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// TensorView object
|
||||
View view;
|
||||
|
||||
/// Delta
|
||||
Coord<View::kRank, double> delta;
|
||||
|
||||
/// Offset
|
||||
double offset;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Constructor
|
||||
LinearCombinationFunc(
|
||||
View const &view,
|
||||
Distribution dist
|
||||
): view(view) {
|
||||
|
||||
offset = dist.linear.offset;
|
||||
if (View::kRank >= 1) {
|
||||
delta[View::kRank - 1] = dist.linear.delta_column;
|
||||
}
|
||||
if (View::kRank >= 2) {
|
||||
delta[View::kRank - 2] = dist.linear.delta_row;
|
||||
}
|
||||
// Additional ranks have delta of zero
|
||||
for (int i = View::kRank - 2; i > 0; --i) {
|
||||
delta[i - 1] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute linear combination
|
||||
void operator()(TensorCoord const &coord) {
|
||||
double result = offset;
|
||||
|
||||
for (int i = 0; i < View::kRank; ++i) {
|
||||
result += delta[i] * double(coord[i]);
|
||||
}
|
||||
view.at(coord) = T(result);
|
||||
}
|
||||
};

/// Writes 1 along the tensor's generalized diagonal and 0 elsewhere
template <typename View_>
struct IdentityFunc {

  /// TensorView
  typedef View_ View;

  /// Scalar type
  typedef typename View::Storage T;

  /// Coordinate in tensor's index space
  typedef typename View::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View object
  View view;

  /// Constructor
  IdentityFunc(View const &view): view(view) { }

  /// Writes 1 if all indices of the coordinate are equal, 0 otherwise
  void operator()(TensorCoord const &coord) {
    bool equal = true;
    for (int i = 0; i < View::kRank; ++i) {
      if (coord[i] != coord[0]) {
        equal = false;
      }
    }
    view.at(coord) = equal ? T(1) : T(0);
  }
};
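
// For example (illustrative): in a rank-3 tensor, coordinate (2, 2, 2) receives
// T(1) while (0, 1, 0) receives T(0). For rank-2 views this produces the
// conventional identity matrix.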

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Initializes a tensor randomly or procedurally.
template <typename View>
void TensorInitialize(View const &view,
                      unsigned seed,
                      Distribution const &dist) {

  typedef typename View::Storage Scalar;

  switch (dist.kind) {
    case Distribution::Uniform:
      {
        typedef detail::RandomUniformFunc<View> Func;
        typedef typename Func::Params Params;

        TensorForEach<Func, View::kRank, Params>(
          view.size(),
          Params(view, seed, dist)
        );
      }
      break;
    case Distribution::Gaussian:
      {
        typedef detail::RandomGaussianFunc<View> Func;
        typedef typename Func::Params Params;

        TensorForEach<Func, View::kRank, Params>(
          view.size(),
          Params(view, seed, dist)
        );
      }
      break;
    case Distribution::Linear:
      {
        typedef detail::LinearCombinationFunc<View> Func;

        TensorForEach<Func, View::kRank, Func>(
          view.size(),
          Func(view, dist));
      }
      break;
    case Distribution::Identity:
      {
        typedef detail::IdentityFunc<View> Func;

        Func func(view);

        TensorForEach<Func, View::kRank, Func>(view.size(), func);
      }
      break;
    default:
      break;
  }
}
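
// Usage sketch (illustrative only; 'view' stands for any TensorView-like object
// satisfying the View concept used above, and Distribution's members are assumed
// to be directly assignable, as the accesses above suggest):
//
//   Distribution dist;
//   dist.kind = Distribution::Gaussian;
//   dist.gaussian.mean = 0.0;
//   dist.gaussian.stddev = 4.0;
//   dist.int_scale = 2;              // snap samples to multiples of 0.25
//
//   cutlass::reference::host::TensorInitialize(view, /*seed=*/2018, dist);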

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Compares two tensor views of equal rank and dimension.
template <typename ViewL, typename ViewR>
struct TensorEqualsFunc {

  /// Storage type
  typedef typename ViewL::Storage T;

  /// Unsigned integer type of the same size as the storage type
  typedef typename cutlass::TypeTraits<T>::unsigned_type UnsignedType;

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  /// Assertions
  static_assert(ViewL::kRank == ViewR::kRank,
    "Cannot compare tensors of different rank");

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Pointer to result scalar - written with 0 only if a mismatch is found
  int *result;

  //
  // Methods
  //

  /// Constructor
  TensorEqualsFunc(
    ViewL const &lhs,
    ViewR const &rhs,
    int *result
  ): lhs(lhs), rhs(rhs), result(result) { }

  /// Bitwise equality check of the storage representations
  void operator()(TensorCoord const &coord) {
    UnsignedType _lhs = reinterpret_cast<UnsignedType const &>(lhs.at(coord));
    UnsignedType _rhs = reinterpret_cast<UnsignedType const &>(rhs.at(coord));
    if (_lhs != _rhs) {
      *result = 0;
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Returns true if two tensor views are bitwise equal.
template <typename ViewL, typename ViewR>
bool TensorEquals(ViewL const &lhs, ViewR const &rhs) {

  // Sizes must be identical
  if (lhs.size() != rhs.size()) {
    return false;
  }

  int result = 1;

  typedef detail::TensorEqualsFunc<ViewL, ViewR> Func;
  Func func(lhs, rhs, &result);

  TensorForEach<Func, ViewL::kRank, Func>(lhs.size(), func);

  return result != 0;
}
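
// Usage sketch (illustrative; 'view_a' and 'view_b' are hypothetical views of
// identical rank and size):
//
//   if (!cutlass::reference::host::TensorEquals(view_a, view_b)) {
//     std::cerr << "Mismatch between reference and computed tensors\n";
//   }
//
// Because the comparison is bitwise, floating-point results should be produced
// through a quantizing distribution (see int_scale above) or identical code paths.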

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Helper to apply a binary operator in place
template <typename ViewL, typename ViewR, typename BinaryFunc>
struct TensorFuncBinaryOp {

  /// Coordinate in tensor's index space
  typedef typename ViewL::TensorCoord TensorCoord;

  //
  // Data members
  //

  /// View of left-hand-side tensor
  ViewL lhs;

  /// View of right-hand-side tensor
  ViewR rhs;

  /// Binary function applied to each element
  BinaryFunc func;

  //
  // Methods
  //

  /// Constructor
  TensorFuncBinaryOp(
    ViewL const &lhs,
    ViewR const &rhs,
    BinaryFunc func = BinaryFunc()): lhs(lhs), rhs(rhs), func(func) { }

  /// Applies the binary operator, overwriting the left-hand element
  void operator()(TensorCoord const &coord) {
    lhs.at(coord) = func(lhs.at(coord), rhs.at(coord));
  }
};
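
// Usage sketch (illustrative; 'ViewA'/'ViewB' and 'view_a'/'view_b' are
// hypothetical view types and instances of identical size, and <functional>
// is assumed to be included). Computes view_a += view_b elementwise by driving
// the functor with TensorForEach:
//
//   typedef cutlass::reference::host::TensorFuncBinaryOp<
//     ViewA, ViewB, std::plus<float> > TensorAdd;
//
//   cutlass::reference::host::TensorForEach<TensorAdd, ViewA::kRank, TensorAdd>(
//     view_a.size(),
//     TensorAdd(view_a, view_b));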

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace host
} // namespace reference
} // namespace cutlass
102
tools/util/reference/host/tensor_foreach.h
Normal file
@ -0,0 +1,102 @@
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <stdexcept>
#include "cutlass/cutlass.h"
#include "tools/util/reference/device/kernel/tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace host {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines several helpers
namespace detail {

/// Helper to perform for-each operation
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {

  /// Index of the active rank
  static int const kActiveRank = Rank - RankRemaining - 1;

  /// Constructor for general rank - iterates over this rank and recurses into the next
  TensorForEachHelper(
    Func &func,
    Coord<Rank> const &size,
    Coord<Rank> &coord) {

    for (int i = 0; i < size.at(kActiveRank); ++i) {
      coord[kActiveRank] = i;
      TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, size, coord);
    }
  }
};

/// Partial specialization terminating the recursion
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {

  /// Index of the active rank
  static int const kActiveRank = Rank - 1;

  /// Constructor for the fastest changing rank - invokes the functor at each coordinate
  TensorForEachHelper(
    Func &func,
    Coord<Rank> const &size,
    Coord<Rank> &coord) {

    for (int i = 0; i < size.at(kActiveRank); ++i) {
      coord[kActiveRank] = i;
      func(coord);
    }
  }
};

} // namespace detail

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Iterates over the index space of a tensor
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  TensorForEach(Coord<Rank> size, Params params = Params()) {

    Func func(params);
    Coord<Rank> coord;

    detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord);
  }
};
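
// Illustrative note (not from the original source): the recursion makes rank
// (Rank - 1) the fastest-changing index. A hypothetical functor printing every
// coordinate of a 2x3 index space (assumes <cstdio> and cutlass::make_Coord):
//
//   struct PrintCoord {
//     PrintCoord(int) { }                        // Params is unused here
//     void operator()(Coord<2> const &coord) {
//       std::printf("(%d, %d)\n", coord[0], coord[1]);
//     }
//   };
//
//   TensorForEach<PrintCoord, 2, int>(make_Coord(2, 3));
//
// prints (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2) in that order.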

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace host
} // namespace reference
} // namespace cutlass