195 lines
7.1 KiB
C++
195 lines
7.1 KiB
C++
/***************************************************************************************************
|
|
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
* list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************************************/
|
|
/*! \file
|
|
\brief
|
|
|
|
This file contains definitions and utility functions for describing convolution problem sizes in terms of
|
|
activation (NHWC), filter (KRSC), output (NPQK), padding (pad_h, pad_w), stride (stride_h, stride_w), and
|
|
dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map CUTLASS's implicit gemm
|
|
tensor extents, sizes, and data types to that of the convolution's extents, sizes, and data types.
|
|
|
|
* Mapping convolutions to Gemm computation *
|
|
|
|
Cutlass implements convolutions with the Implicit Gemm algorithm. This algorithm performs a gemm
|
|
(general matrix-matrix multiply) on the convolution tensors Activation, Filter, and Output.
|
|
The underlying gemm operation follows the standard gemm definition:
|
|
|
|
C = A * B + C
|
|
|
|
A and B are input matrices
|
|
C is source and output matrix
|
|
|
|
|
|
For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
|
|
to convolution tensors Activation, Filter and Output as described in the table below.
|
|
|
|
___________________________________________________________________________
|
|
ConvolutionalOperator | A | B | C
|
|
___________________________________________________________________________
|
|
| | | | |
|
|
| Fprop | Activation | Filter | Output |
|
|
| Dgrad | Output | Filter | Activation |
|
|
| Wgrad | Output | Activation | Filter |
|
|
___________________________________________________________________________
|
|
|
|
In convolution codebase, DO NOT mix using (A, B, C) with (Activation, Filter, Output).
|
|
|
|
For example, it's confusing and error prone to document a convolution class or function
|
|
as operating on "A, B, Output." Instead, use the mapping functions below,
|
|
and adhere to using either A, B, C or Activation, Filter, Output.
|
|
|
|
Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
|
|
Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "cutlass/cutlass.h"
|
|
#include "cutlass/layout/tensor.h"
|
|
#include "cutlass/tensor_coord.h"
|
|
#include "cutlass/fast_math.h"
|
|
#include "cutlass/gemm/gemm_enumerated_types.h"
|
|
#include "cutlass/matrix_coord.h"
|
|
|
|
namespace cutlass {
|
|
namespace conv {
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Convolutional operator
|
|
enum class Operator {
|
|
kFprop,
|
|
kDgrad,
|
|
kWgrad,
|
|
kDeconv
|
|
};
|
|
|
|
/// Distinguishes convolution from cross correlation
|
|
enum class Mode {
|
|
kCrossCorrelation,
|
|
kConvolution
|
|
};
|
|
|
|
/// Selects among several implementation variants trading off performance with simplicity
|
|
enum class IteratorAlgorithm {
|
|
kAnalytic, ///< functionally correct in all cases but lower performance
|
|
kOptimized, ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
|
|
kFixedChannels, ///< Analytic algorithm optimized for fixed channel count (C == AccessSize)
|
|
kFewChannels, ///< Analytic algorithm optimized for few channels (C divisible by AccessSize)
|
|
kFixedStrideDilation ///< Optimized for fixed stride and dilation
|
|
};
|
|
|
|
/// Distinguishes among partial specializations that accelerate certain problems where convolution
|
|
/// stride is unit.
|
|
enum class StrideSupport {
|
|
kStrided, ///< arbitrary convolution stride
|
|
kUnity, ///< unit convolution stride
|
|
kFixed ///< fixed convolution stride
|
|
};
|
|
|
|
/// Identifies split-K mode
|
|
enum class SplitKMode {
|
|
kNone,
|
|
kSerial,
|
|
kParallel
|
|
};
|
|
|
|
/// Identifies group mode
|
|
enum class GroupMode {
|
|
kNone,
|
|
kSingleGroup, ///< One CTA calculates one group or less
|
|
kMultipleGroup, ///< One CTA calculates multiple groups
|
|
kDepthwise ///< One CTA calculates cta_n groups (problem_size.C == problem_size.K == problem_size.groups)
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Shape of a tensor
|
|
template <
|
|
int N = 1,
|
|
int H = 1,
|
|
int W = 1,
|
|
int C = 1
|
|
>
|
|
struct TensorNHWCShape {
|
|
static int const kN = N;
|
|
static int const kH = H;
|
|
static int const kW = W;
|
|
static int const kC = C;
|
|
|
|
static int const kHW = H * W;
|
|
static int const kNHW = N * kHW;
|
|
static int const kNHWC = N * H * W * C;
|
|
|
|
static int const kCount = kNHWC;
|
|
|
|
//
|
|
// Static member functions
|
|
//
|
|
|
|
/// Returns a Coord object
|
|
CUTLASS_HOST_DEVICE
|
|
static Coord<4> toCoord() {
|
|
return make_Coord(kN, kH, kW, kC);
|
|
}
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Shape of a conv2d stride, which controls how the filter convolves around the input volume
|
|
template <
|
|
/// Stride in horizontal direction
|
|
int u = 1,
|
|
/// Stride in vertical direction
|
|
int v = 1
|
|
>
|
|
struct Stride2D {
|
|
static int const kU = u;
|
|
static int const kV = v;
|
|
|
|
//
|
|
// Static member functions
|
|
//
|
|
|
|
/// Returns a Coord object
|
|
CUTLASS_HOST_DEVICE
|
|
static Coord<2> toCoord() {
|
|
return make_Coord(kU, kV);
|
|
}
|
|
};
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
} // namespace conv
|
|
} // namespace cutlass
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|