CUTLASS 2.4 (Implicit GEMM convolution) (#147)
CUTLASS 2.4 (Implicit GEMM Convolution) Co-authored-by: Manish Gupta <manigupta@nvidia.com>, Haicheng Wu <haichengw@nvidia.com>, Dustyn Blasig <dblasig@nvidia.com>, Andrew Kerr <akerr@nvidia.com>
This commit is contained in:
@ -62,6 +62,18 @@ void reorder_column(TensorRef<Element, Layout> dest,
|
||||
}
|
||||
}
|
||||
|
||||
template <int Interleaved, typename Element, typename Layout>
|
||||
void reorder_convK(TensorRef<Element, Layout> dest,
|
||||
TensorRef<Element, Layout> src,
|
||||
cutlass::gemm::GemmCoord problem_size) {
|
||||
|
||||
TensorRef<Element, layout::RowMajorInterleaved<Interleaved>> mappedDest(dest.data(), dest.stride(0));
|
||||
TensorRef<Element, layout::RowMajorInterleaved<Interleaved>> mappedSrc(src.data(), src.stride(0));
|
||||
|
||||
reorder_column<Interleaved>(
|
||||
mappedDest, mappedSrc, problem_size);
|
||||
}
|
||||
|
||||
/// This is needed for the sparse tensor core kernels. The purpose
|
||||
/// is to use ldmatrix to load from shared memory to the register file.
|
||||
template <typename Element, typename LayoutDest, typename LayoutSrc>
|
||||
|
||||
1536
tools/util/include/cutlass/util/reference/device/convolution.h
Normal file
1536
tools/util/include/cutlass/util/reference/device/convolution.h
Normal file
File diff suppressed because it is too large
Load Diff
767
tools/util/include/cutlass/util/reference/host/convolution.h
Normal file
767
tools/util/include/cutlass/util/reference/host/convolution.h
Normal file
@ -0,0 +1,767 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief Reference implementation for convolution in host-side code.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/coord.h"
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/tensor_view.h"
|
||||
#include "cutlass/conv/convolution.h"
|
||||
#include "cutlass/conv/conv2d_problem_size.h"
|
||||
#include "cutlass/conv/conv3d_problem_size.h"
|
||||
|
||||
namespace cutlass {
|
||||
namespace reference {
|
||||
namespace host {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Forward propagation
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// y = conv2d(x, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2dFprop(
|
||||
conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_x,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
|
||||
int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
|
||||
|
||||
if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) {
|
||||
|
||||
ElementA a = tensor_x.at({n, h, w, c});
|
||||
ElementB b = tensor_w.at({k, r, s, c});
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k));
|
||||
}
|
||||
|
||||
tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Depthwise-separable convolution
|
||||
template <typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementAccumulator,
|
||||
typename ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator> >
|
||||
void Depsep_Fprop(
|
||||
cutlass::TensorView<ElementA, LayoutA> tensor_A,
|
||||
cutlass::TensorView<ElementB, LayoutB> tensor_B,
|
||||
cutlass::TensorView<ElementC, LayoutC> tensor_C,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta,
|
||||
cutlass::Tensor4DCoord padding,
|
||||
cutlass::Coord<2> conv_stride,
|
||||
cutlass::Coord<2> dilation,
|
||||
cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < tensor_C.extent().n(); ++n) {
|
||||
for (int p = 0; p < tensor_C.extent().h(); ++p) {
|
||||
for (int q = 0; q < tensor_C.extent().w(); ++q) {
|
||||
for (int g = 0; g < tensor_C.extent().c(); ++g) {
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
for (int r = 0; r < tensor_B.extent().h(); ++r) {
|
||||
for (int s = 0; s < tensor_B.extent().w(); ++s) {
|
||||
if ((p * conv_stride[0] - padding[0] + r * dilation[0]) < tensor_A.extent().h() &&
|
||||
(p * conv_stride[0] - padding[0] + r * dilation[0]) >= 0 &&
|
||||
(q * conv_stride[1] - padding[2] + s * dilation[1]) < tensor_A.extent().w() &&
|
||||
(q * conv_stride[1] - padding[2] + s * dilation[1]) >= 0) {
|
||||
ElementA a = tensor_A.at(
|
||||
cutlass::make_Coord(n,
|
||||
p * conv_stride[0] - padding[0] + r * dilation[0],
|
||||
q * conv_stride[1] - padding[2] + s * dilation[1],
|
||||
g));
|
||||
|
||||
ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation)
|
||||
? tensor_B.at(cutlass::make_Coord(g, r, s, 0))
|
||||
: tensor_B.at(cutlass::make_Coord(
|
||||
g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0));
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g));
|
||||
tensor_C.at(cutlass::make_Coord(n, p, q, g)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Dgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dx = dgrad(dy, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2dDgrad(
|
||||
cutlass::conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int h = 0; h < problem_size.H; ++h) {
|
||||
for (int w = 0; w < problem_size.W; ++w) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
|
||||
int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;
|
||||
|
||||
if (p >= 0 && (p % problem_size.stride_h) == 0 &&
|
||||
q >= 0 && (q % problem_size.stride_w) == 0) {
|
||||
|
||||
p = p / problem_size.stride_h;
|
||||
q = q / problem_size.stride_w;
|
||||
|
||||
if (p < problem_size.P && q < problem_size.Q) {
|
||||
|
||||
ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k));
|
||||
ElementB b = tensor_w.at(cutlass::make_Coord(k, r, s, c));
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
|
||||
} // for (K)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c));
|
||||
}
|
||||
|
||||
tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (W)
|
||||
} // for (H)
|
||||
} // for (N)
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Wgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dw = wgrad(dy, x)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2dWgrad(
|
||||
cutlass::conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_x,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
InnerProductOp inner_product_op;
|
||||
ConvertOp convert_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
|
||||
cutlass::Tensor4DCoord b_coord;
|
||||
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
b_coord = make_Coord(
|
||||
n,
|
||||
p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h,
|
||||
q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w,
|
||||
c);
|
||||
|
||||
if (b_coord.h() < problem_size.H && b_coord.h() >= 0 &&
|
||||
b_coord.w() < problem_size.W && b_coord.w() >= 0) {
|
||||
|
||||
ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k)));
|
||||
ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord));
|
||||
acc = inner_product_op(a, b, acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c));
|
||||
}
|
||||
|
||||
tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
} // for (K)
|
||||
}
|
||||
|
||||
/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad.
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv2d(
|
||||
conv::Operator convolutional_operator,
|
||||
conv::Conv2dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_A,
|
||||
TensorRef<ElementB, LayoutB> tensor_B,
|
||||
TensorRef<ElementC, LayoutC> tensor_C,
|
||||
TensorRef<ElementC, LayoutC> tensor_D,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
switch (convolutional_operator) {
|
||||
case conv::Operator::kFprop:
|
||||
Conv2dFprop<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kDgrad:
|
||||
Conv2dDgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kWgrad:
|
||||
Conv2dWgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// 3D convolution
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// y = conv3d(x, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3dFprop(
|
||||
conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_x,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_y_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int z = 0; z < problem_size.Z; ++z) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int t = 0; t < problem_size.T; ++t) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
int filter_t = t;
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_t = problem_size.T - 1 - t;
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
|
||||
int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
|
||||
int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
|
||||
|
||||
if (d >= 0 && d < problem_size.D &&
|
||||
h >=0 && h < problem_size.H &&
|
||||
w >= 0 && w < problem_size.W) {
|
||||
|
||||
ElementA a = tensor_x.at({n, d, h, w, c});
|
||||
ElementB b = tensor_w.at({k, t, r, s, c});
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k));
|
||||
}
|
||||
|
||||
tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Dgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dx = dgrad(dy, w)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3dDgrad(
|
||||
cutlass::conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_w,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dx_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
ConvertOp convert_op;
|
||||
InnerProductOp inner_product_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int d = 0; d < problem_size.D; ++d) {
|
||||
for (int h = 0; h < problem_size.H; ++h) {
|
||||
for (int w = 0; w < problem_size.W; ++w) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int t = 0; t < problem_size.T; ++t) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
|
||||
int filter_t = t;
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_t = problem_size.T - 1 - t;
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d;
|
||||
int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
|
||||
int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;
|
||||
|
||||
if (z >= 0 && (z % problem_size.stride_d) == 0 &&
|
||||
p >= 0 && (p % problem_size.stride_h) == 0 &&
|
||||
q >= 0 && (q % problem_size.stride_w) == 0) {
|
||||
|
||||
z = z / problem_size.stride_d;
|
||||
p = p / problem_size.stride_h;
|
||||
q = q / problem_size.stride_w;
|
||||
|
||||
if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) {
|
||||
|
||||
ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k));
|
||||
ElementB b = tensor_w.at(cutlass::make_Coord(k, t, r, s, c));
|
||||
|
||||
acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
|
||||
}
|
||||
}
|
||||
|
||||
} // for (K)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
} // for (T)
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c));
|
||||
}
|
||||
|
||||
tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (W)
|
||||
} // for (H)
|
||||
} // for (D)
|
||||
} // for (N)
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Wgrad
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// dw = wgrad(dy, x)
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3dWgrad(
|
||||
cutlass::conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_dy,
|
||||
TensorRef<ElementB, LayoutB> tensor_x,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_in,
|
||||
TensorRef<ElementC, LayoutC> tensor_dw_out,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
InnerProductOp inner_product_op;
|
||||
ConvertOp convert_op;
|
||||
|
||||
// Apply MMA and accumulate ElementAccumulator
|
||||
for (int k = 0; k < problem_size.K; ++k) {
|
||||
for (int t = 0; t < problem_size.T; ++t) {
|
||||
for (int r = 0; r < problem_size.R; ++r) {
|
||||
for (int s = 0; s < problem_size.S; ++s) {
|
||||
for (int c = 0; c < problem_size.C; ++c) {
|
||||
|
||||
ElementAccumulator acc = ElementAccumulator();
|
||||
|
||||
for (int n = 0; n < problem_size.N; ++n) {
|
||||
for (int z = 0; z < problem_size.Z; ++z) {
|
||||
for (int p = 0; p < problem_size.P; ++p) {
|
||||
for (int q = 0; q < problem_size.Q; ++q) {
|
||||
|
||||
int filter_t = t;
|
||||
int filter_r = r;
|
||||
int filter_s = s;
|
||||
|
||||
if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
|
||||
filter_t = problem_size.T - 1 - t;
|
||||
filter_r = problem_size.R - 1 - r;
|
||||
filter_s = problem_size.S - 1 - s;
|
||||
}
|
||||
|
||||
Tensor5DCoord b_coord = make_Coord(
|
||||
n,
|
||||
z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d,
|
||||
p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h,
|
||||
q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w,
|
||||
c);
|
||||
|
||||
if (b_coord.d() < problem_size.D && b_coord.d() >= 0 &&
|
||||
b_coord.h() < problem_size.H && b_coord.h() >= 0 &&
|
||||
b_coord.w() < problem_size.W && b_coord.w() >= 0) {
|
||||
|
||||
ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)));
|
||||
ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord));
|
||||
|
||||
acc = inner_product_op(a, b, acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply Epilogue, compute ElementCompute, convert and store ElementC
|
||||
ElementC c_ref = ElementC();
|
||||
|
||||
if (beta != ElementCompute()) {
|
||||
c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c));
|
||||
}
|
||||
|
||||
tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) =
|
||||
convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
|
||||
|
||||
} // for (C)
|
||||
} // for (S)
|
||||
} // for (R)
|
||||
} // for (T)
|
||||
} // for (K)
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Generic 3D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad.
|
||||
template <
|
||||
typename ElementA,
|
||||
typename LayoutA,
|
||||
typename ElementB,
|
||||
typename LayoutB,
|
||||
typename ElementC,
|
||||
typename LayoutC,
|
||||
typename ElementCompute,
|
||||
typename ElementAccumulator = ElementCompute,
|
||||
typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
|
||||
typename InnerProductOp = multiply_add<ElementAccumulator>
|
||||
>
|
||||
void Conv3d(
|
||||
conv::Operator convolutional_operator,
|
||||
conv::Conv3dProblemSize problem_size,
|
||||
TensorRef<ElementA, LayoutA> tensor_A,
|
||||
TensorRef<ElementB, LayoutB> tensor_B,
|
||||
TensorRef<ElementC, LayoutC> tensor_C,
|
||||
TensorRef<ElementC, LayoutC> tensor_D,
|
||||
ElementCompute alpha,
|
||||
ElementCompute beta) {
|
||||
|
||||
switch (convolutional_operator) {
|
||||
case conv::Operator::kFprop:
|
||||
Conv3dFprop<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kDgrad:
|
||||
Conv3dDgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
case conv::Operator::kWgrad:
|
||||
Conv3dWgrad<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementCompute,
|
||||
ElementAccumulator,
|
||||
ConvertOp, InnerProductOp
|
||||
>(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace host
|
||||
} // namespace reference
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -249,6 +249,45 @@ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specialization for multiply-add
|
||||
template <typename ElementA, typename LayoutA, typename ElementB,
|
||||
typename LayoutB, typename ElementC, typename LayoutC,
|
||||
typename ScalarType, typename ComputeType>
|
||||
struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
|
||||
ComputeType, arch::OpMultiplyAddFastBF16> {
|
||||
|
||||
void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
|
||||
TensorRef<ElementA, LayoutA> tensor_a,
|
||||
TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
|
||||
TensorRef<ElementC, LayoutC> tensor_c,
|
||||
ComputeType initial_accum = ComputeType(0)) {
|
||||
static_assert(
|
||||
LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
|
||||
"Tensors must be of rank 2");
|
||||
|
||||
compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
|
||||
ScalarType, ComputeType, multiply_add<ComputeType>>(
|
||||
problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
|
||||
}
|
||||
|
||||
void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
|
||||
TensorRef<ElementA, LayoutA> tensor_a,
|
||||
TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
|
||||
TensorRef<ElementC, LayoutC> tensor_c,
|
||||
TensorRef<ElementC, LayoutC> tensor_d,
|
||||
ComputeType initial_accum = ComputeType(0)) {
|
||||
static_assert(
|
||||
LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
|
||||
"Tensors must be of rank 2");
|
||||
|
||||
compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
|
||||
ScalarType, ComputeType, multiply_add<ComputeType>>(
|
||||
problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Partial specialization for multiply-add-saturate
|
||||
template <typename ElementA, typename LayoutA, typename ElementB,
|
||||
typename LayoutB, typename ElementC, typename LayoutC,
|
||||
|
||||
Reference in New Issue
Block a user