CUTLASS 2.6 (#298)

CUTLASS 2.6
This commit is contained in:
Manish Gupta
2021-07-22 21:40:53 -07:00
committed by GitHub
parent 6c29fe20ba
commit e5d51840e8
308 changed files with 32408 additions and 4722 deletions

View File

@ -90,11 +90,19 @@ struct CommandLine {
/**
* Returns number of naked (non-flag and non-key-value) commandline parameters
*/
template <typename value_t>
int num_naked_args() const {
size_t num_naked_args() const {
return args.size();
}
/**
 * Print naked (non-flag and non-key-value) commandline parameters
 *
 * \param out  output stream the arguments are written to, one per line
 */
void print_naked_args(std::ostream &out) const {
  // Iterate by const reference: the original `auto arg` copied each
  // argument string on every iteration for no benefit.
  for (const auto &arg : args) {
    out << " " << arg << "\n";
  }
}
/**
* Returns the commandline parameter for a given index (not including flags)
*/

View File

@ -325,12 +325,12 @@ public:
}
/// Returns the layout object's stride in a given physical dimension
Index stride(int dim) const {
LongIndex stride(int dim) const {
return layout_.stride().at(dim);
}
/// Returns the layout object's stride in a given physical dimension
Index & stride(int dim) {
LongIndex & stride(int dim) {
return layout_.stride().at(dim);
}

View File

@ -0,0 +1,52 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <utility>
#include "cutlass/cutlass.h"
/**
* \file
* \brief C++11 version of index_sequence.
*/
namespace cutlass {

/// Compile-time sequence of indices — a C++11 stand-in for std::index_sequence.
/// Previously this was only forward-declared; defining it (with size()) makes
/// instances usable as complete types, mirroring the std interface.
template <size_t... Seq>
struct index_sequence {
  /// Number of indices in the sequence
  static constexpr size_t size() { return sizeof...(Seq); }
};

/// Recursive helper: peels N down toward zero while accumulating the
/// generated indices in Next...
template <size_t N, size_t... Next>
struct index_sequence_helper : index_sequence_helper<N - 1, N - 1, Next...> {};

/// Terminal case once the recursion has produced index 0 followed by the rest
template <size_t... Next>
struct index_sequence_helper<0, 0, Next...> {
  using type = index_sequence<0, Next...>;
};

/// Terminal case for the empty sequence. Without this specialization,
/// make_index_sequence<0> selected the primary template and recursed with
/// N - 1 == SIZE_MAX (size_t wraparound), failing to compile.
template <>
struct index_sequence_helper<0> {
  using type = index_sequence<>;
};

/// Yields index_sequence<0, 1, ..., N-1>
template <size_t N>
using make_index_sequence = typename index_sequence_helper<N>::type;

} // namespace cutlass

View File

@ -65,8 +65,8 @@ __global__ void Gemm(
// Map each thread to a unique tile of the output matrix
MatrixCoord output_coord(
(threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow,
(threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn
MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow),
MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn)
);
// Compute the general matrix product

View File

@ -39,6 +39,7 @@
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/conv/conv3d_problem_size.h"
#include <iostream>
namespace cutlass {
namespace reference {
@ -243,7 +244,21 @@ void Conv2dDgrad(
p = p / problem_size.stride_h;
q = q / problem_size.stride_w;
#if 0
std::cout << "row:"
<< n * problem_size.H * problem_size.W +
h * problem_size.W +
w << " "
<< "n, p, q: ("
<< n << ", "
<< p << ", "
<< q << ") * "
<< "r, s: ("
<< r << ", "
<< s << ") ["
<< ((p < problem_size.P && q < problem_size.Q) ? "true":"false") << "]"
<< std::endl;
#endif
if (p < problem_size.P && q < problem_size.Q) {
ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k));

View File

@ -0,0 +1,60 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <cmath>
#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/util/reference/host/tensor_reduce.h"
#include "cutlass/core_io.h"
namespace cutlass {
namespace reference {
namespace host {
/// Computes the relative error metric ||A_computed - B_reference|| / ||B_reference||
/// for a computed tensor with respect to a reference tensor.
template <
  typename Element,
  typename Layout,
  typename ComputeType = double
>
ComputeType TensorRelativeErrorMetric(
  TensorView<Element, Layout> view_A_computed,
  TensorView<Element, Layout> view_B_reference,
  ComputeType identity = ComputeType()
) {
  // Numerator: norm of the element-wise difference between the two tensors
  ComputeType norm_of_difference =
    cutlass::reference::host::TensorNormDiff(view_A_computed, view_B_reference, identity);

  // Denominator: norm of the reference tensor
  ComputeType norm_of_reference =
    cutlass::reference::host::TensorNorm(view_B_reference, identity);

  return norm_of_difference / norm_of_reference;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace host
} // namespace reference
} // namespace cutlass

View File

@ -36,6 +36,7 @@
// Cutlass includes
#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/quaternion.h"
#include "cutlass/array.h"
#include "cutlass/numeric_types.h"
#include "cutlass/subbyte_reference.h"
@ -219,6 +220,56 @@ struct RandomGaussianFunc<complex<Element> > {
}
};
/// Partial specialization for initializing a Quaternion value with a Gaussian distribution.
template <typename Element>
struct RandomGaussianFunc<Quaternion<Element> > {
uint64_t seed;      ///< seed used to initialize std::srand in the constructor
double mean;        ///< mean of the Gaussian distribution
double stddev;      ///< standard deviation of the Gaussian distribution
int int_scale;      ///< if non-negative, values are quantized to this many fractional bits
double pi;          ///< cached value of pi, computed once as acos(-1)
//
// Methods
//
/// Constructs the functor and seeds the C standard library RNG with `seed_`
RandomGaussianFunc(
uint64_t seed_ = 0,
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)) {
std::srand((unsigned)seed);
}
/// Compute random value and update RNG state
Quaternion<Element> operator()() const {
// One Gaussian sample per quaternion component (w, x, y, z)
Element reals[4];
for (int i = 0; i < 4; ++i) {
// Box-Muller transform to generate random numbers with Normal distribution
// NOTE(review): u1 == 0 (probability ~1/RAND_MAX) yields log(0) = -inf — consider guarding
double u1 = double(std::rand()) / double(RAND_MAX);
double u2 = double(std::rand()) / double(RAND_MAX);
// Compute Gaussian random value
double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2);
rnd = mean + stddev * rnd;
// If int_scale is non-negative, quantize to a power-of-two grid
// (multiply, truncate to int, divide) to facilitate error testing
if (int_scale >= 0) {
rnd = double(int(rnd * double(1 << int_scale)));
reals[i] = from_real<Element>(rnd / double(1 << int_scale));
}
else {
reals[i] = from_real<Element>(rnd);
}
}
return Quaternion<Element>(reals[0], reals[1], reals[2], reals[3]);
}
};
/// Computes a random Gaussian distribution
template <
typename Element, ///< Element type
@ -429,6 +480,58 @@ struct RandomUniformFunc<complex<Element> > {
}
};
/// Partial specialization for initializing a Quaternion value with a uniform distribution.
template <typename Element>
struct RandomUniformFunc<Quaternion<Element> > {

  using Real = typename RealType<Element>::Type;

  uint64_t seed;    ///< seed used to initialize std::srand in the constructor
  double range;     ///< width of the distribution (max - min)
  double min;       ///< lower bound of the distribution
  int int_scale;    ///< if non-negative, values are quantized to this many fractional bits

  //
  // Methods
  //

  /// Constructs the functor and seeds the C standard library RNG
  RandomUniformFunc(
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
    std::srand((unsigned)seed);
  }

  /// Compute random value and update RNG state
  Quaternion<Element> operator()() const {
    // One uniform sample per quaternion component (w, x, y, z)
    Element components[4];
    for (int idx = 0; idx < 4; ++idx) {
      // Map std::rand() into [min, min + range)
      double sample = double(std::rand()) / double(RAND_MAX);
      sample = min + range * sample;
      // Random values are cast to integer after scaling by a power of two
      // to facilitate error testing
      if (int_scale >= 0) {
        double const grid = double(1 << int_scale);
        sample = double(int(sample * grid));
        components[idx] = from_real<Element>(Real(sample / grid));
      }
      else {
        components[idx] = from_real<Element>(Real(sample));
      }
    }
    return make_Quaternion(components[0], components[1], components[2], components[3]);
  }
};
/// Computes a random Gaussian distribution
template <
typename Element, ///< Element type
@ -510,6 +613,32 @@ void TensorFillRandomUniform(
TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits);
}
/// Fills a tensor of quaternions with random values drawn from a uniform distribution.
template <
  typename Element,                           ///< Element type
  typename Layout>                            ///< Layout function
void TensorFillRandomUniform(
  TensorView<Quaternion<Element>, Layout> dst,    ///< destination tensor
  uint64_t seed,                                  ///< seed for RNG
  double max = 1,                                 ///< upper bound of distribution
  double min = 0,                                 ///< lower bound for distribution
  int bits = -1) {                                ///< If non-negative, specifies number of fractional bits that
                                                  /// are not truncated to zero. Permits reducing precision of
                                                  /// data.

  // Build the per-element RNG, wrap it in a fill functor, and apply it
  // across the tensor's full extent.
  detail::RandomUniformFunc<Quaternion<Element>> rng(seed, max, min, bits);

  detail::TensorFillRandomUniformFunc<Quaternion<Element>, Layout> fill_op(dst, rng);

  TensorForEach(dst.extent(), fill_op);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Fills a tensor with random values with a uniform random distribution.
template <