CUTLASS 2.4 (Implicit GEMM convolution) (#147)
CUTLASS 2.4 (Implicit GEMM Convolution) Co-authored-by: Manish Gupta <manigupta@nvidia.com>, Haicheng Wu <haichengw@nvidia.com>, Dustyn Blasig <dblasig@nvidia.com>, Andrew Kerr <akerr@nvidia.com>
This commit is contained in:
@ -62,6 +62,18 @@ void reorder_column(TensorRef<Element, Layout> dest,
|
||||
}
|
||||
}
|
||||
|
||||
template <int Interleaved, typename Element, typename Layout>
|
||||
void reorder_convK(TensorRef<Element, Layout> dest,
|
||||
TensorRef<Element, Layout> src,
|
||||
cutlass::gemm::GemmCoord problem_size) {
|
||||
|
||||
TensorRef<Element, layout::RowMajorInterleaved<Interleaved>> mappedDest(dest.data(), dest.stride(0));
|
||||
TensorRef<Element, layout::RowMajorInterleaved<Interleaved>> mappedSrc(src.data(), src.stride(0));
|
||||
|
||||
reorder_column<Interleaved>(
|
||||
mappedDest, mappedSrc, problem_size);
|
||||
}
|
||||
|
||||
/// This is needed for the sparse tensor core kernels. The purpose
|
||||
/// is to use ldmatrix to load from shared memory to the register file.
|
||||
template <typename Element, typename LayoutDest, typename LayoutSrc>
|
||||
|
||||
1536
tools/util/include/cutlass/util/reference/device/convolution.h
Normal file
1536
tools/util/include/cutlass/util/reference/device/convolution.h
Normal file
File diff suppressed because it is too large
Load Diff
767
tools/util/include/cutlass/util/reference/host/convolution.h
Normal file
767
tools/util/include/cutlass/util/reference/host/convolution.h
Normal file
@ -0,0 +1,767 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief Reference implementation for convolution in host-side code.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/coord.h"
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/tensor_view.h"
|
||||
#include "cutlass/conv/convolution.h"
|
||||
#include "cutlass/conv/conv2d_problem_size.h"
|
||||
#include "cutlass/conv/conv3d_problem_size.h"
|
||||
|
||||
namespace cutlass {
|
||||
namespace reference {
|
||||
namespace host {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Forward propagation
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// y = conv2d(x, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2dFprop(
|
||||
conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_x,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
|
||||
int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
|
||||
|
||||
if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) {
|
||||
|
||||
ElementA a = tensor_x.at({n, h, w, c});
|
||||
ElementB b = tensor_w.at({k, r, s, c});
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k));
|
||||
}
|
||||
|
||||
tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Depthwise-separable convolution
|
||||
template <typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementAccumulator,
|
||||
typename ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator> >
|
||||
void Depsep_Fprop(
|
||||
cutlass::TensorView<ElementA, LayoutA> tensor_A,
|
||||
cutlass::TensorView<ElementB, LayoutB> tensor_B,
|
||||
cutlass::TensorView<ElementC, LayoutC> tensor_C,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta,
|
||||
cutlass::Tensor4DCoord padding,
|
||||
cutlass::Coord<2> conv_stride,
|
||||
cutlass::Coord<2> dilation,
|
||||
cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < tensor_C.extent().n(); ++n) {
|
||||
for (int p = 0; p < tensor_C.extent().h(); ++p) {
|
||||
for (int q = 0; q < tensor_C.extent().w(); ++q) {
|
||||
for (int g = 0; g < tensor_C.extent().c(); ++g) {
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
for (int r = 0; r < tensor_B.extent().h(); ++r) {
|
||||
for (int s = 0; s < tensor_B.extent().w(); ++s) {
|
||||
if ((p * conv_stride[0] - padding[0] + r * dilation[0]) < tensor_A.extent().h() &&
|
||||
(p * conv_stride[0] - padding[0] + r * dilation[0]) >= 0 &&
|
||||
(q * conv_stride[1] - padding[2] + s * dilation[1]) < tensor_A.extent().w() &&
|
||||
(q * conv_stride[1] - padding[2] + s * dilation[1]) >= 0) {
|
||||
ElementA a = tensor_A.at(
|
||||
cutlass::make_Coord(n,
|
||||
p * conv_stride[0] - padding[0] + r * dilation[0],
|
||||
q * conv_stride[1] - padding[2] + s * dilation[1],
|
||||
g));
|
||||
|
||||
ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation)
|
||||
? tensor_B.at(cutlass::make_Coord(g, r, s, 0))
|
||||
: tensor_B.at(cutlass::make_Coord(
|
||||
g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0));
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g));
|
||||
tensor_C.at(cutlass::make_Coord(n, p, q, g)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Dgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dx = dgrad(dy, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2dDgrad(
|
||||
cutlass::conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int h = 0; h < problem_size.H; ++h) {
|
||||
for (int w = 0; w < problem_size.W; ++w) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
|
||||
int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;
|
||||
|
||||
if (p >= 0 && (p % problem_size.stride_h) == 0 &&
|
||||
q >= 0 && (q % problem_size.stride_w) == 0) {
|
||||
|
||||
p = p / problem_size.stride_h;
|
||||
q = q / problem_size.stride_w;
|
||||
|
||||
if (p < problem_size.P && q < problem_size.Q) {
|
||||
|
||||
ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k));
|
||||
ElementB b = tensor_w.at(cutlass::make_Coord(k, r, s, c));
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
|
||||
} // for (K)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c));
|
||||
}
|
||||
|
||||
tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (W)
|
||||
} // for (H)
|
||||
} // for (N)
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Wgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dw = wgrad(dy, x)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2dWgrad(
|
||||
cutlass::conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_x,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
InnerProductOp inner_product_op;
|
||||
ConvertOp convert_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
|
||||
cutlass::Tensor4DCoord b_coord;
|
||||
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
b_coord = make_Coord(
|
||||
n,
|
||||
p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h,
|
||||
q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w,
|
||||
c);
|
||||
|
||||
if (b_coord.h() < problem_size.H && b_coord.h() >= 0 &&
|
||||
b_coord.w() < problem_size.W && b_coord.w() >= 0) {
|
||||
|
||||
ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k)));
|
||||
ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord));
|
||||
acc = inner_product_op(a, b, acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c));
|
||||
}
|
||||
|
||||
tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
} // for (K)
|
||||
}
|
||||
|
||||
/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad.
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2d(
|
||||
conv::Operator convolutional_operator,
|
||||
conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_A,
|
||||
TensorRef<ElementB, LayoutB> tensor_B,
|
||||
TensorRef<ElementC, LayoutC> tensor_C,
|
||||
TensorRef<ElementC, LayoutC> tensor_D,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
switch (convolutional_operator) {
|
||||
case conv::Operator::kFprop:
|
||||
Conv2dFprop<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kDgrad:
|
||||
Conv2dDgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kWgrad:
|
||||
Conv2dWgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// 3D convolution
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// y = conv3d(x, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3dFprop(
|
||||
conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_x,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int z = 0; z < problem_size.Z; ++z) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int t = 0; t < problem_size.T; ++t) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
int filter_t = t;
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_t = problem_size.T - 1 - t;
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
|
||||
int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
|
||||
int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
|
||||
|
||||
if (d >= 0 && d < problem_size.D &&
|
||||
h >=0 && h < problem_size.H &&
|
||||
w >= 0 && w < problem_size.W) {
|
||||
|
||||
ElementA a = tensor_x.at({n, d, h, w, c});
|
||||
ElementB b = tensor_w.at({k, t, r, s, c});
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k));
|
||||
}
|
||||
|
||||
tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Dgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dx = dgrad(dy, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3dDgrad(
|
||||
cutlass::conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int d = 0; d < problem_size.D; ++d) {
|
||||
for (int h = 0; h < problem_size.H; ++h) {
|
||||
for (int w = 0; w < problem_size.W; ++w) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int t = 0; t < problem_size.T; ++t) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
int filter_t = t;
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_t = problem_size.T - 1 - t;
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d;
|
||||
int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
|
||||
int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;
|
||||
|
||||
if (z >= 0 && (z % problem_size.stride_d) == 0 &&
|
||||
p >= 0 && (p % problem_size.stride_h) == 0 &&
|
||||
q >= 0 && (q % problem_size.stride_w) == 0) {
|
||||
|
||||
z = z / problem_size.stride_d;
|
||||
p = p / problem_size.stride_h;
|
||||
q = q / problem_size.stride_w;
|
||||
|
||||
if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) {
|
||||
|
||||
ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k));
|
||||
ElementB b = tensor_w.at(cutlass::make_Coord(k, t, r, s, c));
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
|
||||
} // for (K)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
} // for (T)
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c));
|
||||
}
|
||||
|
||||
tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (W)
|
||||
} // for (H)
|
||||
} // for (D)
|
||||
} // for (N)
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Wgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dw = wgrad(dy, x)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3dWgrad(
|
||||
cutlass::conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_x,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
InnerProductOp inner_product_op;
|
||||
ConvertOp convert_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
for (int t = 0; t < problem_size.T; ++t) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int z = 0; z < problem_size.Z; ++z) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
|
||||
int filter_t = t;
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_t = problem_size.T - 1 - t;
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
Tensor5DCoord b_coord = make_Coord(
|
||||
n,
|
||||
z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d,
|
||||
p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h,
|
||||
q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w,
|
||||
c);
|
||||
|
||||
if (b_coord.d() < problem_size.D && b_coord.d() >= 0 &&
|
||||
b_coord.h() < problem_size.H && b_coord.h() >= 0 &&
|
||||
b_coord.w() < problem_size.W && b_coord.w() >= 0) {
|
||||
|
||||
ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)));
|
||||
ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord));
|
||||
|
||||
acc = inner_product_op(a, b, acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c));
|
||||
}
|
||||
|
||||
tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
} // for (T)
|
||||
} // for (K)
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Generic 3D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad.
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3d(
|
||||
conv::Operator convolutional_operator,
|
||||
conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_A,
|
||||
TensorRef<ElementB, LayoutB> tensor_B,
|
||||
TensorRef<ElementC, LayoutC> tensor_C,
|
||||
TensorRef<ElementC, LayoutC> tensor_D,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
switch (convolutional_operator) {
|
||||
case conv::Operator::kFprop:
|
||||
Conv3dFprop<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kDgrad:
|
||||
Conv3dDgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kWgrad:
|
||||
Conv3dWgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace host
|
||||
} // namespace reference
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -249,6 +249,45 @@ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specialization for multiply-add
|
||||
template <typename ElementA, typename LayoutA, typename ElementB,
|
||||
typename LayoutB, typename ElementC, typename LayoutC,
|
||||
typename ScalarType, typename ComputeType>
|
||||
struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
|
||||
ComputeType, arch::OpMultiplyAddFastBF16> {
|
||||
|
||||
void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
|
||||
TensorRef<ElementA, LayoutA> tensor_a,
|
||||
TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
|
||||
TensorRef<ElementC, LayoutC> tensor_c,
|
||||
ComputeType initial_accum = ComputeType(0)) {
|
||||
static_assert(
|
||||
LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
|
||||
"Tensors must be of rank 2");
|
||||
|
||||
compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
|
||||
ScalarType, ComputeType, multiply_add<ComputeType>>(
|
||||
problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
|
||||
}
|
||||
|
||||
void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
|
||||
TensorRef<ElementA, LayoutA> tensor_a,
|
||||
TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
|
||||
TensorRef<ElementC, LayoutC> tensor_c,
|
||||
TensorRef<ElementC, LayoutC> tensor_d,
|
||||
ComputeType initial_accum = ComputeType(0)) {
|
||||
static_assert(
|
||||
LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
|
||||
"Tensors must be of rank 2");
|
||||
|
||||
compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
|
||||
ScalarType, ComputeType, multiply_add<ComputeType>>(
|
||||
problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specialization for multiply-add-saturate
|
||||
template <typename ElementA, typename LayoutA, typename ElementB,
|
||||
typename LayoutB, typename ElementC, typename LayoutC,
|
||||
|
||||
Reference in New Issue
Block a user