b2b bias vector support (#482)

* b2b bias vector support

* add files

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
Author: Haicheng Wu
Date:   2022-04-30 07:16:15 -04:00 (committed by GitHub)
Parent: 86ce09aed1
Commit: ec2b4fd85d

34 changed files with 1096 additions and 324 deletions


@@ -82,9 +82,7 @@ __global__ void Conv2dFprop(
   TensorRef<ElementC, LayoutC> tensor_y_in,
   TensorRef<ElementC, LayoutC> tensor_y_out,
   ElementCompute alpha,
-  ElementCompute beta,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias
+  ElementCompute beta
 ) {

   ConvertOp convert_op;
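
For orientation, the reduced device-kernel signature after this hunk reads roughly as below. This is a sketch reconstructed from the context lines; the template parameter list and the leading arguments are assumptions following the usual CUTLASS reference-convolution conventions, only the tail is confirmed by the hunk.

  // Sketch of the simplified reference-kernel signature after this change.
  // Assumption: leading parameters and template list follow CUTLASS reference
  // convolution conventions; only the trailing parameters appear in the hunk.
  template <typename ElementA, typename LayoutA,
            typename ElementB, typename LayoutB,
            typename ElementC, typename LayoutC,
            typename ElementCompute, typename ElementAccumulator,
            typename ConvertOp, typename InnerProductOp>
  __global__ void Conv2dFprop(
    conv::Conv2dProblemSize problem_size,      // assumed from convention
    TensorRef<ElementA, LayoutA> tensor_x,     // assumed: activations
    TensorRef<ElementB, LayoutB> tensor_w,     // assumed: filters
    TensorRef<ElementC, LayoutC> tensor_y_in,
    TensorRef<ElementC, LayoutC> tensor_y_out,
    ElementCompute alpha,
    ElementCompute beta);                      // tensor_scale/tensor_bias removed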
@@ -186,26 +184,13 @@ __global__ void Conv2dFprop(
       int thread_k = k_start + n;

       if (thread_k < problem_size.K) {
-        if(alpha == ElementCompute()) { // use per-channel scale and bias
-          ElementCompute scale = tensor_scale.at({0, thread_k});
-          ElementCompute bias = tensor_bias.at({0, thread_k});
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            scale * ElementCompute(accum[m][n]) + bias);
+        ElementCompute c_ref = ElementCompute();
+        if (beta != ElementCompute()) {
+          c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
         }
-        else if(tensor_bias.good()) { // use per-channel bias
-          ElementCompute bias = tensor_bias.at({0, thread_k});
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + bias);
-        }
-        else {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
-          }
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
+        tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
+          alpha * ElementCompute(accum[m][n]) + beta * c_ref);
       }
     }
   }
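
In words: the generic reference epilogue no longer special-cases per-channel scale/bias and always computes y = alpha * accum + beta * c. The removed branches, isolated below as a minimal host-side sketch (plain float in place of ElementCompute is an assumption for illustration; these helpers are hypothetical, not CUTLASS APIs), are what moves out of the shared reference and into the b2b-specific path.

  // Removed from the generic reference: per-channel epilogue, selected when
  // alpha == 0 (scale and bias) or when only a bias vector is provided.
  float epilogue_per_channel(float accum, float scale_k, float bias_k) {
    return scale_k * accum + bias_k;   // y[n,p,q,k] = scale[k] * accum + bias[k]
  }

  // Retained in the generic reference: plain linear combination.
  float epilogue_linear_combination(float accum, float c, float alpha, float beta) {
    return alpha * accum + beta * c;   // y[n,p,q,k] = alpha * accum + beta * c[n,p,q,k]
  }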
@@ -1015,9 +1000,7 @@ Status Conv2dFprop(
   TensorRef<ElementC, LayoutC> tensor_y_out,
   ElementCompute alpha,
   ElementCompute beta,
-  cudaStream_t stream = nullptr,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale = TensorRef<ElementCompute, layout::RowMajor>(),
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias = TensorRef<ElementCompute, layout::RowMajor>() ) {
+  cudaStream_t stream = nullptr) {

   //
   // Blocking factors improve performance of reference implementation
@@ -1056,9 +1039,7 @@ Status Conv2dFprop(
     tensor_y_in,
     tensor_y_out,
     alpha,
-    beta,
-    tensor_scale,
-    tensor_bias
+    beta
   );

   cudaError_t result = cudaPeekAtLastError();
@@ -1448,9 +1429,7 @@ Status Conv2d(
   TensorRef<ElementC, LayoutC> tensor_D,
   ElementCompute alpha,
   ElementCompute beta,
-  cudaStream_t stream = nullptr,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale = TensorRef<ElementCompute, layout::RowMajor>(),
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias = TensorRef<ElementCompute, layout::RowMajor>() ) {
+  cudaStream_t stream = nullptr) {

   switch (convolutional_operator) {
   case conv::Operator::kFprop:
@@ -1461,7 +1440,7 @@ Status Conv2d(
       ElementCompute,
       ElementAccumulator,
       ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream, tensor_scale, tensor_bias);
+    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
     break;

   case conv::Operator::kDgrad:
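
After this commit, the generic reference entry points accept only alpha and beta; a caller that previously passed tensor_scale/tensor_bias now invokes the dispatch roughly as sketched below. The namespace and the tensor setup are assumptions for illustration; the argument list matches the updated signature in the hunks above.

  // Hedged usage sketch: calling the simplified reference Conv2d dispatch.
  // Assumptions: cutlass::reference::device namespace, HostTensor-style
  // device_ref() accessors; neither is shown in the diff.
  cutlass::Status status = cutlass::reference::device::Conv2d<
    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
    ElementCompute, ElementAccumulator
  >(
    cutlass::conv::Operator::kFprop,
    problem_size,
    tensor_A.device_ref(),
    tensor_B.device_ref(),
    tensor_C.device_ref(),
    tensor_D.device_ref(),
    alpha,
    beta);   // no trailing tensor_scale / tensor_bias arguments anymore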