CUTLASS 2.6 (#298)

CUTLASS 2.6
This commit is contained in:
Manish Gupta
2021-07-22 21:40:53 -07:00
committed by GitHub
parent 6c29fe20ba
commit e5d51840e8
308 changed files with 32408 additions and 4722 deletions

View File

@ -274,9 +274,9 @@ public:
library::LayoutTypeID const &layout_a,
library::LayoutTypeID const &layout_b,
library::LayoutTypeID const &layout_c) {
std::vector<int> stride_activations;
std::vector<int> stride_filters;
std::vector<int> stride_output;
std::vector<int64_t> stride_activations;
std::vector<int64_t> stride_filters;
std::vector<int64_t> stride_output;
// Strides for interleaved fprop
if (conv_kind == library::ConvKind::kFprop &&

View File

@ -268,7 +268,7 @@ public:
A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
// Returns stride vector for tensor A
std::vector<int> stride_a(library::ConvKind const &conv_kind) {
std::vector<int64_t> stride_a(library::ConvKind const &conv_kind) {
return {
configuration.layout_a(conv_kind).stride()[0],
configuration.layout_a(conv_kind).stride()[1],
@ -278,7 +278,7 @@ public:
}
// Returns stride vector for tensor B
std::vector<int> stride_b(library::ConvKind const &conv_kind) {
std::vector<int64_t> stride_b(library::ConvKind const &conv_kind) {
return {
configuration.layout_b(conv_kind).stride()[0],
@ -289,7 +289,7 @@ public:
}
// Returns stride vector for tensor C
std::vector<int> stride_c(library::ConvKind const &conv_kind) {
std::vector<int64_t> stride_c(library::ConvKind const &conv_kind) {
return {
configuration.layout_c(conv_kind).stride()[0],

View File

@ -67,6 +67,15 @@ CutlassProfiler::~CutlassProfiler() {
/// Execute the program
int CutlassProfiler::operator()() {
if (options_.cmdline.num_naked_args() > 0) {
std::cerr << "Unknown args: \n";
options_.cmdline.print_naked_args(std::cerr);
std::cerr << "\n\n\n";
print_usage_(std::cout);
return 1;
}
if (options_.about.help) {
if (options_.operation_kind == library::OperationKind::kInvalid) {
print_usage_(std::cout);

View File

@ -54,7 +54,7 @@ size_t DeviceAllocation::bytes(library::NumericTypeID type, size_t capacity) {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Layout>
static std::vector<int> get_packed_layout_stride(std::vector<int> const &extent) {
static std::vector<int64_t> get_packed_layout_stride(std::vector<int> const &extent) {
typename Layout::TensorCoord extent_coord;
typename Layout::Stride stride_coord;
@ -67,25 +67,25 @@ static std::vector<int> get_packed_layout_stride(std::vector<int> const &extent)
extent_coord[i] = extent.at(i);
}
std::vector<int> stride;
std::vector<int64_t> stride;
stride.resize(Layout::kStrideRank, 0);
Layout layout = Layout::packed(extent_coord);
stride_coord = layout.stride();
for (int i = 0; i < Layout::kStrideRank; ++i) {
stride.at(i) = stride_coord[i];
stride.at(i) = (int64_t)stride_coord[i];
}
return stride;
}
/// Returns the stride of a packed layout
std::vector<int> DeviceAllocation::get_packed_layout(
std::vector<int64_t> DeviceAllocation::get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent) {
std::vector<int> stride;
std::vector<int64_t> stride;
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
@ -159,7 +159,7 @@ static size_t construct_layout_(
void *bytes,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> &stride) {
std::vector<int64_t> &stride) {
if (extent.size() != Layout::kRank) {
throw std::runtime_error(
@ -183,7 +183,7 @@ static size_t construct_layout_(
typename Layout::Stride stride_coord;
for (int i = 0; i < Layout::kStrideRank; ++i) {
stride_coord[i] = stride.at(i);
stride_coord[i] = (int)stride.at(i);
}
typename Layout::TensorCoord extent_coord;
@ -210,7 +210,7 @@ size_t DeviceAllocation::construct_layout(
void *bytes,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> &stride) {
std::vector<int64_t> &stride) {
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
@ -309,7 +309,7 @@ DeviceAllocation::DeviceAllocation(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count
):
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1) {
@ -370,12 +370,12 @@ DeviceAllocation &DeviceAllocation::reset(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
reset();
tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int) * library::get_layout_stride_rank(layout_id)), 0);
tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int64_t) * library::get_layout_stride_rank(layout_id)), 0);
type_ = type;
@ -422,7 +422,7 @@ library::LayoutTypeID DeviceAllocation::layout() const {
return layout_;
}
std::vector<int> const & DeviceAllocation::stride() const {
std::vector<int64_t> const & DeviceAllocation::stride() const {
return stride_;
}
@ -1277,6 +1277,15 @@ struct vector_to_coord {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
}
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {
coord[Rank - 1] = (int)vec.at(Rank - 1);
if (Rank > 1) {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
}
};
/// Permits copying dynamic vectors into static-length vectors
@ -1287,6 +1296,11 @@ struct vector_to_coord<TensorCoord, 1> {
coord[0] = vec.at(0);
}
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {
coord[0] = (int)vec.at(0);
}
};
/// Permits copying dynamic vectors into static-length vectors
@ -1306,7 +1320,7 @@ static void write_tensor_csv_static_tensor_view(
DeviceAllocation &allocation) {
Coord<Layout::kRank> extent;
Coord<Layout::kStrideRank> stride;
Coord<Layout::kStrideRank, typename Layout::Stride::Index> stride;
if (allocation.extent().size() != Layout::kRank) {
throw std::runtime_error("Allocation extent has invalid rank");
@ -1317,7 +1331,8 @@ static void write_tensor_csv_static_tensor_view(
}
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
vector_to_coord<Coord<Layout::kStrideRank>, Layout::kStrideRank>(stride, allocation.stride());
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::Stride::Index>,
Layout::kStrideRank>(stride, allocation.stride());
Layout layout(stride);
HostTensor<Element, Layout> host_tensor(extent, layout, false);
@ -1498,6 +1513,162 @@ void DeviceAllocation::write_tensor_csv(
}
}
/// Builds a TensorView<Element, Layout> over `allocation` and fills every element
/// with `val` using the device-side reference TensorFill kernel.
///
/// Throws std::runtime_error if the allocation's extent or stride vector does not
/// match the static rank of `Layout`.
template <typename Element, typename Layout>
static void tensor_fill_tensor_view(DeviceAllocation &allocation, Element val = Element()) {

  if (allocation.extent().size() != Layout::kRank) {
    throw std::runtime_error("Allocation extent has invalid rank");
  }

  if (allocation.stride().size() != Layout::kStrideRank) {
    throw std::runtime_error("Allocation stride has invalid rank");
  }

  // Copy the dynamic extent/stride vectors into fixed-rank coordinates.
  Coord<Layout::kRank> extent_coord;
  Coord<Layout::kStrideRank, typename Layout::LongIndex> stride_coord;

  vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent_coord, allocation.extent());
  vector_to_coord<Coord<Layout::kStrideRank, typename Layout::LongIndex>,
                  Layout::kStrideRank>(stride_coord, allocation.stride());

  TensorView<Element, Layout> view(
    static_cast<Element *>(allocation.data()),
    Layout(stride_coord),
    extent_coord
  );

  cutlass::reference::device::TensorFill<Element, Layout>(view, val);
}
/// Dispatches on the allocation's layout and fills it uniformly with `val`.
///
/// Throws std::runtime_error for layouts with no fill support.
template <typename Element>
static void tensor_fill(DeviceAllocation &allocation, Element val = Element()) {

  switch (allocation.layout()) {

  // Canonical 2-D layouts
  case library::LayoutTypeID::kRowMajor:
    tensor_fill_tensor_view<Element, layout::RowMajor>(allocation, val);
    return;
  case library::LayoutTypeID::kColumnMajor:
    tensor_fill_tensor_view<Element, layout::ColumnMajor>(allocation, val);
    return;

  // Convolution activation/output layouts
  case library::LayoutTypeID::kTensorNHWC:
    tensor_fill_tensor_view<Element, layout::TensorNHWC>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorNDHWC:
    tensor_fill_tensor_view<Element, layout::TensorNDHWC>(allocation, val);
    return;

  // Interleaved convolution layouts
  case library::LayoutTypeID::kTensorNC32HW32:
    tensor_fill_tensor_view<Element, layout::TensorNCxHWx<32>>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorNC64HW64:
    tensor_fill_tensor_view<Element, layout::TensorNCxHWx<64>>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorC32RSK32:
    tensor_fill_tensor_view<Element, layout::TensorCxRSKx<32>>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorC64RSK64:
    tensor_fill_tensor_view<Element, layout::TensorCxRSKx<64>>(allocation, val);
    return;

  default:
    throw std::runtime_error("Unsupported layout");
  }
}
/// Fills a tensor uniformly with a value (most frequently used to clear the tensor)
/// Fills a tensor uniformly with a value (most frequently used to clear the tensor)
///
/// NOTE(review): the default argument previously written on this out-of-class
/// definition (`double val = 0.0`) is visible only within this translation unit;
/// for callers in other TUs the default must appear on the in-class declaration
/// (and specifying it in both places is ill-formed). The definition therefore
/// takes no default here.
///
/// Throws std::runtime_error for numeric types with no fill support.
void DeviceAllocation::fill(double val) {

  // Dispatch on the element type; `val` is converted to the element type
  // (or used as the real part for complex element types).
  switch (this->type()) {
  case library::NumericTypeID::kF16:
    tensor_fill<half_t>(*this, static_cast<half_t>(val));
    break;

  case library::NumericTypeID::kBF16:
    tensor_fill<bfloat16_t>(*this, static_cast<bfloat16_t>(val));
    break;

  case library::NumericTypeID::kTF32:
    tensor_fill<tfloat32_t>(*this, static_cast<tfloat32_t>(val));
    break;

  case library::NumericTypeID::kF32:
    tensor_fill<float>(*this, static_cast<float>(val));
    break;

  case library::NumericTypeID::kF64:
    tensor_fill<double>(*this, static_cast<double>(val));
    break;

  case library::NumericTypeID::kS2:
    tensor_fill<int2b_t>(*this, static_cast<int2b_t>(val));
    break;

  case library::NumericTypeID::kS4:
    tensor_fill<int4b_t>(*this, static_cast<int4b_t>(val));
    break;

  case library::NumericTypeID::kS8:
    tensor_fill<int8_t>(*this, static_cast<int8_t>(val));
    break;

  case library::NumericTypeID::kS16:
    tensor_fill<int16_t>(*this, static_cast<int16_t>(val));
    break;

  case library::NumericTypeID::kS32:
    tensor_fill<int32_t>(*this, static_cast<int32_t>(val));
    break;

  case library::NumericTypeID::kS64:
    tensor_fill<int64_t>(*this, static_cast<int64_t>(val));
    break;

  case library::NumericTypeID::kB1:
    tensor_fill<uint1b_t>(*this, static_cast<uint1b_t>(val));
    break;

  case library::NumericTypeID::kU2:
    tensor_fill<uint2b_t>(*this, static_cast<uint2b_t>(val));
    break;

  case library::NumericTypeID::kU4:
    tensor_fill<uint4b_t>(*this, static_cast<uint4b_t>(val));
    break;

  case library::NumericTypeID::kU8:
    tensor_fill<uint8_t>(*this, static_cast<uint8_t>(val));
    break;

  case library::NumericTypeID::kU16:
    tensor_fill<uint16_t>(*this, static_cast<uint16_t>(val));
    break;

  case library::NumericTypeID::kU32:
    tensor_fill<uint32_t>(*this, static_cast<uint32_t>(val));
    break;

  case library::NumericTypeID::kU64:
    tensor_fill<uint64_t>(*this, static_cast<uint64_t>(val));
    break;

  case library::NumericTypeID::kCF16:
    tensor_fill<cutlass::complex<half_t> >(*this, from_real<half_t>(val));
    break;

  case library::NumericTypeID::kCF32:
    tensor_fill<cutlass::complex<float> >(*this, from_real<float>(val));
    break;

  case library::NumericTypeID::kCF64:
    tensor_fill<cutlass::complex<double> >(*this, from_real<double>(val));
    break;

  default:
    throw std::runtime_error("Unsupported numeric type");
  }
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace profiler

View File

@ -64,7 +64,7 @@ private:
library::LayoutTypeID layout_;
/// Stride vector
std::vector<int> stride_;
std::vector<int64_t> stride_;
/// Extent vector
std::vector<int> extent_;
@ -84,7 +84,7 @@ public:
static size_t bytes(library::NumericTypeID type, size_t capacity);
/// Returns the stride of a packed layout
static std::vector<int> get_packed_layout(
static std::vector<int64_t> get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent);
@ -93,7 +93,7 @@ public:
void *bytes,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> &stride);
std::vector<int64_t> &stride);
/// Returns true if two blocks have exactly the same value
static bool block_compare_equal(
@ -124,7 +124,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
~DeviceAllocation();
@ -139,7 +139,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Returns a buffer owning the tensor reference
@ -162,7 +162,7 @@ public:
library::LayoutTypeID layout() const;
/// Gets the stride vector
std::vector<int> const & stride() const;
std::vector<int64_t> const & stride() const;
/// Gets the extent vector
std::vector<int> const & extent() const;
@ -193,6 +193,9 @@ public:
/// Initializes a host allocation to a random distribution using std::cout
void initialize_random_sparsemeta_host(int seed, int MetaSizeInBits);
/// Uniformly fills a tensor with a value when provided o.w. zero
void fill(double value);
/// Copies from an equivalent-sized tensor in device memory
void copy_from_device(void const *ptr);

View File

@ -52,7 +52,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count);
@ -69,7 +69,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
DeviceAllocation *allocation =
@ -133,7 +133,7 @@ DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
DeviceAllocation *allocation =

View File

@ -77,7 +77,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Allocates memory of a given type, capacity (elements), and name
@ -87,7 +87,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Allocates memory for sparse meta data
@ -98,7 +98,7 @@ public:
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Clears named allocations (but does not necessarily free memory)