cutlass/cutlass/shape.h

/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines Shape implementing the Layout concept for representing a 4D hypercube of objects.
*/
#pragma once

#include <cutlass/cutlass.h>

namespace cutlass {

////////////////////////////////////////////////////////////////////////////////////////////////////

/*!@defgroup layout_concept Layout Concept
* @{
* @par Implementations of \ref layout_concept are used to describe a cube with DxHxW elements and C
* scalars per element. An HxW slice of a cube is called an image and a cube consists of D images.
*
* @par Notations
*   Let Layout be an implementation of the \ref layout_concept.
*
* @par Valid Expressions
* - <b>Layout::D</b> specifies the depth of a cube
* - <b>Layout::H</b> specifies the height of a cube
* - <b>Layout::W</b> specifies the width of a cube
* - <b>Layout::C</b> specifies the number of channels of each element in a cube
* - <b>Layout::W_c</b> specifies the number of scalars of each row in one image of a cube.
* - <b>Layout::H_w</b> specifies the number of elements in an image slice.
* - <b>Layout::H_w_c</b> specifies the number of scalars in an image slice.
* - <b>Layout::D_h_w</b> specifies the number of elements in a cube.
* - <b>Layout::D_h_w_c</b> specifies the number of scalars in a cube.
* - <b>Layout::Strides</b> is a \ref layout_concept specifying the strides.
* @}
*/

/**
* @brief Compile-time extents of a cube, implementing the \ref layout_concept.
* @concept{layout_concept}
* @tparam kD_ Depth of the cube; defaults to 1.
* @tparam kH_ Height of the cube; defaults to 1.
* @tparam kW_ Width of the cube; defaults to 1.
* @tparam kC_ Number of scalars per element; defaults to 1.
*/
template <int kD_ = 1,
          int kH_ = 1,
          int kW_ = 1,
          int kC_ = 1>
struct Shape {
  /// Extent of the cube in the D (depth) dimension.
  static const int kD = kD_;
  /// Extent of the cube in the H (height) dimension.
  static const int kH = kH_;
  /// Extent of the cube in the W (width) dimension.
  static const int kW = kW_;
  /// Extent of the cube in the C dimension, i.e. scalars per element.
  static const int kC = kC_;
};

/**
* @brief Computes the derived counts of a \ref layout_concept based class.
* @tparam Shape Type providing the integral constants kD, kH, kW and kC.
*/
template <typename Shape>
struct ShapeCount {
  /// Scalars per row of an image: kW * kC.
  static const int kWc = Shape::kW * Shape::kC;
  /// Elements per image (one HxW slice): kH * kW.
  static const int kHw = Shape::kH * Shape::kW;
  /// Scalars per image: kH * kW * kC.
  static const int kHwc = kWc * Shape::kH;
  /// Elements in the whole cube: kD * kH * kW.
  static const int kDhw = kHw * Shape::kD;
  /// Scalars in the whole cube: kD * kH * kW * kC.
  static const int kDhwc = kHwc * Shape::kD;
  /// Total number of scalars in the 4D space; same value as kDhwc.
  static const int kCount = kDhwc;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Multiplies every dimension of the shape A_ by the compile-time factor kScale_.
* @tparam A_ Type providing the integral constants kD, kH, kW and kC.
* @tparam kScale_ Factor applied to each of the four dimensions.
*/
template <typename A_, int kScale_>
struct ShapeScale {
  /// The scaled shape.
  typedef Shape<kScale_ * A_::kD,
                kScale_ * A_::kH,
                kScale_ * A_::kW,
                kScale_ * A_::kC>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Adds two shapes dimension by dimension.
* @tparam A_ First operand; provides the integral constants kD, kH, kW and kC.
* @tparam B_ Second operand; provides the integral constants kD, kH, kW and kC.
*/
template <typename A_, typename B_>
struct ShapeAdd {
  /// Shape whose dimensions are the per-dimension sums A_ + B_.
  typedef Shape<A_::kD + B_::kD,
                A_::kH + B_::kH,
                A_::kW + B_::kW,
                A_::kC + B_::kC>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Subtracts the shape B_ from the shape A_ dimension by dimension.
* @tparam A_ Minuend; provides the integral constants kD, kH, kW and kC.
* @tparam B_ Subtrahend; provides the integral constants kD, kH, kW and kC.
*/
template <typename A_, typename B_>
struct ShapeSub {
  /// Shape whose dimensions are the per-dimension differences A_ - B_.
  typedef Shape<A_::kD - B_::kD,
                A_::kH - B_::kH,
                A_::kW - B_::kW,
                A_::kC - B_::kC>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Multiplies two shapes dimension by dimension.
* @tparam A_ First factor; provides the integral constants kD, kH, kW and kC.
* @tparam B_ Second factor; provides the integral constants kD, kH, kW and kC.
*/
template <typename A_, typename B_>
struct ShapeMul {
  /// Shape whose dimensions are the per-dimension products A_ * B_.
  typedef Shape<A_::kD * B_::kD,
                A_::kH * B_::kH,
                A_::kW * B_::kW,
                A_::kC * B_::kC>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Divides the shape A_ by the shape B_ dimension by dimension, using integer division.
* @tparam A_ Dividend; provides the integral constants kD, kH, kW and kC.
* @tparam B_ Divisor; provides the integral constants kD, kH, kW and kC.
*/
template <typename A_, typename B_>
struct ShapeDiv {
  /// Shape whose dimensions are the per-dimension integer quotients A_ / B_.
  typedef Shape<A_::kD / B_::kD,
                A_::kH / B_::kH,
                A_::kW / B_::kW,
                A_::kC / B_::kC>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Takes the larger of two shapes in every dimension.
* @tparam A_ First operand; provides the integral constants kD, kH, kW and kC.
* @tparam B_ Second operand; provides the integral constants kD, kH, kW and kC.
*/
template <typename A_, typename B_>
struct ShapeMax {
  /// Shape whose dimensions are the per-dimension maxima of A_ and B_.
  typedef Shape<(A_::kD < B_::kD ? B_::kD : A_::kD),
                (A_::kH < B_::kH ? B_::kH : A_::kH),
                (A_::kW < B_::kW ? B_::kW : A_::kW),
                (A_::kC < B_::kC ? B_::kC : A_::kC)>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Takes the smaller of two shapes in every dimension.
* @tparam A_ First operand; provides the integral constants kD, kH, kW and kC.
* @tparam B_ Second operand; provides the integral constants kD, kH, kW and kC.
*/
template <typename A_, typename B_>
struct ShapeMin {
  /// Shape whose dimensions are the per-dimension minima of A_ and B_.
  typedef Shape<(B_::kD < A_::kD ? B_::kD : A_::kD),
                (B_::kH < A_::kH ? B_::kH : A_::kH),
                (B_::kW < A_::kW ? B_::kW : A_::kW),
                (B_::kC < A_::kC ? B_::kC : A_::kC)>
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Derives a shape of strides from the extents of Shape_.
*
* The dimensions of the resulting Shape are:
*   - kD = Shape_::kH * Shape_::kW * Shape_::kC
*   - kH = Shape_::kW * Shape_::kC
*   - kW = Shape_::kC
*   - kC = kElementsPerAccess
*
* @tparam Shape_ Type providing the extents kH, kW and kC.
* @tparam kElementsPerAccess Value used for the C dimension of the result.
*/
template <typename Shape_, int kElementsPerAccess>
struct ShapeStrides {
  /// The strides, stored as the dimensions of a Shape.
  typedef Shape<Shape_::kH * Shape_::kW * Shape_::kC,  // D
                Shape_::kW * Shape_::kC,               // H
                Shape_::kC,                            // W
                kElementsPerAccess>                    // C
      Shape;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Computes the linear offset of a coordinate inside a cube described by its extents.
* @tparam Shape_ Type providing the extents kH, kW and kC of the cube (kD is not used).
*/
template <typename Shape_>
struct ComputeOffsetFromShape {
  /// Returns the offset of the coordinate (d, h, w, c): each coordinate is multiplied by the
  /// extents of the dimensions that follow it (in D, H, W, C order) and the terms are summed.
  static CUTLASS_DEVICE int get(int d, int h, int w, int c) {
    const int image_term = d * Shape_::kH * Shape_::kW * Shape_::kC;
    const int row_term = h * Shape_::kW * Shape_::kC;
    const int element_term = w * Shape_::kC;
    return image_term + row_term + element_term + c;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Specialization of ComputeOffsetFromShape for a cube with a depth of 1.
* @tparam kSh_ Extent of the H dimension (does not enter the computation).
* @tparam kSw_ Extent of the W dimension.
* @tparam kSc_ Extent of the C dimension.
*/
template <int kSh_, int kSw_, int kSc_>
struct ComputeOffsetFromShape<Shape<1, kSh_, kSw_, kSc_> > {
  /// Returns the offset of the coordinate (h, w, c); the depth coordinate d is ignored.
  static CUTLASS_DEVICE int get(int d, int h, int w, int c) {
    const int row_term = h * kSw_ * kSc_;
    const int element_term = w * kSc_;
    return row_term + element_term + c;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Specialization of ComputeOffsetFromShape for a cube with one channel and a depth of 1.
* @tparam kSh_ Extent of the H dimension (does not enter the computation).
* @tparam kSw_ Extent of the W dimension.
*/
template <int kSh_, int kSw_>
struct ComputeOffsetFromShape<Shape<1, kSh_, kSw_, 1> > {
  /// Returns the offset of the coordinate (h, w); the coordinates d and c are ignored.
  static CUTLASS_DEVICE int get(int d, int h, int w, int c) {
    const int row_term = h * kSw_;
    return row_term + w;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Computes the linear offset of a coordinate from per-dimension strides.
* @tparam Strides_ \ref layout_concept where each dimension (kD, kH, kW, kC) holds the stride of
*         the corresponding coordinate.
*/
template <typename Strides_>
struct ComputeOffsetFromStrides {
  /// Returns the sum of each coordinate multiplied by the stride of its dimension.
  static CUTLASS_DEVICE int get(int d, int h, int w, int c) {
    int offset = d * Strides_::kD;
    offset += h * Strides_::kH;
    offset += w * Strides_::kW;
    offset += c * Strides_::kC;
    return offset;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Specialization of ComputeOffsetFromStrides for strides whose D dimension is 1.
* @tparam S_h_ Stride applied to the h coordinate.
* @tparam S_w_ Stride applied to the w coordinate.
* @tparam S_c_ Stride applied to the c coordinate.
*/
template <int S_h_, int S_w_, int S_c_>
struct ComputeOffsetFromStrides<Shape<1, S_h_, S_w_, S_c_> > {
  /// Returns h * S_h_ + w * S_w_ + c * S_c_; the depth coordinate d is ignored.
  static CUTLASS_DEVICE int get(int d, int h, int w, int c) {
    int offset = h * S_h_;
    offset += w * S_w_;
    offset += c * S_c_;
    return offset;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Specialization of ComputeOffsetFromStrides for strides whose D and C dimensions are 1.
* @tparam S_h_ Stride applied to the h coordinate.
* @tparam S_w_ Stride applied to the w coordinate.
*/
template <int S_h_, int S_w_>
struct ComputeOffsetFromStrides<Shape<1, S_h_, S_w_, 1> > {
  /// Returns h * S_h_ + w * S_w_; the coordinates d and c are ignored.
  static CUTLASS_DEVICE int get(int d, int h, int w, int c) {
    const int row_term = h * S_h_;
    const int column_term = w * S_w_;
    return row_term + column_term;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Decomposes threadIdx.x into the coordinates of a cube whose dimensions are given by
* Threads_, then computes the offset of those coordinates using Strides_.
* @tparam Threads_ The dimensions of the cube the threadIdx.x value is mapped onto.
* @tparam Strides_ The strides used to compute the offset from the coordinates of the cube.
*/
template <typename Threads_, typename Strides_>
struct ComputeThreadOffsetFromStrides {
  /// Returns the offset assigned to the calling thread.
  static CUTLASS_DEVICE int get() {
    // Peel the coordinates off the thread index, starting with C and ending with D. Only c, w
    // and h are reduced modulo their extent; d receives the remaining quotient.
    unsigned int remaining = threadIdx.x;
    const int c = remaining % Threads_::kC;
    remaining = remaining / Threads_::kC;
    const int w = remaining % Threads_::kW;
    remaining = remaining / Threads_::kW;
    const int h = remaining % Threads_::kH;
    const int d = remaining / Threads_::kH;

    // Weight every coordinate by the stride of its dimension.
    int offset = d * Strides_::kD;
    offset += h * Strides_::kH;
    offset += w * Strides_::kW;
    offset += c * Strides_::kC;
    return offset;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////
/**
* @brief Specialization of ComputeThreadOffsetFromStrides for D=1 in both Threads_ and Strides_.
* @tparam T_h_ Extent of the thread cube in the H dimension.
* @tparam T_w_ Extent of the thread cube in the W dimension.
* @tparam T_c_ Extent of the thread cube in the C dimension.
* @tparam S_h_ Stride applied to the h coordinate.
* @tparam S_w_ Stride applied to the w coordinate.
* @tparam S_c_ Stride applied to the c coordinate.
*/
template <int T_h_, int T_w_, int T_c_, int S_h_, int S_w_, int S_c_>
struct ComputeThreadOffsetFromStrides<Shape<1, T_h_, T_w_, T_c_>, Shape<1, S_h_, S_w_, S_c_> > {
  /// Returns the offset assigned to the calling thread.
  static CUTLASS_DEVICE int get() {
    // Peel c, w and h off the thread index; each one is reduced modulo its extent and no depth
    // coordinate is computed.
    unsigned int remaining = threadIdx.x;
    const int c = remaining % T_c_;
    remaining = remaining / T_c_;
    const int w = remaining % T_w_;
    remaining = remaining / T_w_;
    const int h = remaining % T_h_;

    // Weight every coordinate by the stride of its dimension.
    int offset = h * S_h_;
    offset += w * S_w_;
    offset += c * S_c_;
    return offset;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/**
* @brief Specialization of ComputeThreadOffsetFromStrides for D=1 and C=1 in both Threads_ and
* Strides_.
* @tparam T_h_ Extent of the thread cube in the H dimension (does not enter the computation).
* @tparam T_w_ Extent of the thread cube in the W dimension.
* @tparam S_h_ Stride applied to the h coordinate.
* @tparam S_w_ Stride applied to the w coordinate.
*/
template <int T_h_, int T_w_, int S_h_, int S_w_>
struct ComputeThreadOffsetFromStrides<Shape<1, T_h_, T_w_, 1>, Shape<1, S_h_, S_w_, 1> > {
  /// Returns the offset assigned to the calling thread.
  static CUTLASS_DEVICE int get() {
    // Split the thread index into a column (remainder) and a row (quotient). Unlike the D=1
    // specialization with an arbitrary C, the row is not reduced modulo T_h_.
    const unsigned int thread = threadIdx.x;
    const int w = thread % T_w_;
    const int h = thread / T_w_;

    // Weight both coordinates by the stride of their dimension.
    const int row_term = h * S_h_;
    const int column_term = w * S_w_;
    return row_term + column_term;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace cutlass