b2b bias vector support (#482)

* b2b bias vector support

* add files

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
Author: Haicheng Wu
Date:   2022-04-30 07:16:15 -04:00 (committed by GitHub)
Parent: 86ce09aed1
Commit: ec2b4fd85d

34 changed files with 1096 additions and 324 deletions


@@ -82,9 +82,7 @@ __global__ void Conv2dFprop(
   TensorRef<ElementC, LayoutC> tensor_y_in,
   TensorRef<ElementC, LayoutC> tensor_y_out,
   ElementCompute alpha,
-  ElementCompute beta,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias
+  ElementCompute beta
 ) {

   ConvertOp convert_op;
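
For orientation, the reduced device-kernel signature after this hunk reads roughly as below. This is a sketch reconstructed from the context lines; the template parameter list and the leading arguments are assumptions following the usual CUTLASS reference-convolution conventions, only the tail is confirmed by the hunk.

  // Sketch of the simplified reference-kernel signature after this change.
  // Assumption: leading parameters and template list follow CUTLASS reference
  // convolution conventions; only the trailing parameters appear in the hunk.
  template <typename ElementA, typename LayoutA,
            typename ElementB, typename LayoutB,
            typename ElementC, typename LayoutC,
            typename ElementCompute, typename ElementAccumulator,
            typename ConvertOp, typename InnerProductOp>
  __global__ void Conv2dFprop(
    conv::Conv2dProblemSize problem_size,      // assumed from convention
    TensorRef<ElementA, LayoutA> tensor_x,     // assumed: activations
    TensorRef<ElementB, LayoutB> tensor_w,     // assumed: filters
    TensorRef<ElementC, LayoutC> tensor_y_in,
    TensorRef<ElementC, LayoutC> tensor_y_out,
    ElementCompute alpha,
    ElementCompute beta);                      // tensor_scale/tensor_bias removed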
@@ -186,26 +184,13 @@ __global__ void Conv2dFprop(
       int thread_k = k_start + n;

       if (thread_k < problem_size.K) {
-        if(alpha == ElementCompute()) { // use per-channel scale and bias
-          ElementCompute scale = tensor_scale.at({0, thread_k});
-          ElementCompute bias = tensor_bias.at({0, thread_k});
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            scale * ElementCompute(accum[m][n]) + bias);
+        ElementCompute c_ref = ElementCompute();
+        if (beta != ElementCompute()) {
+          c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
         }
-        else if(tensor_bias.good()) { // use per-channel bias
-          ElementCompute bias = tensor_bias.at({0, thread_k});
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + bias);
-        }
-        else {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
-          }
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
+        tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
+          alpha * ElementCompute(accum[m][n]) + beta * c_ref);
       }
     }
   }
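
In words: the generic reference epilogue no longer special-cases per-channel scale/bias and always computes y = alpha * accum + beta * c. The removed branches, isolated below as a minimal host-side sketch (plain float in place of ElementCompute is an assumption for illustration; these helpers are hypothetical, not CUTLASS APIs), are what moves out of the shared reference and into the b2b-specific path.

  // Removed from the generic reference: per-channel epilogue, selected when
  // alpha == 0 (scale and bias) or when only a bias vector is provided.
  float epilogue_per_channel(float accum, float scale_k, float bias_k) {
    return scale_k * accum + bias_k;   // y[n,p,q,k] = scale[k] * accum + bias[k]
  }

  // Retained in the generic reference: plain linear combination.
  float epilogue_linear_combination(float accum, float c, float alpha, float beta) {
    return alpha * accum + beta * c;   // y[n,p,q,k] = alpha * accum + beta * c[n,p,q,k]
  }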
@@ -1015,9 +1000,7 @@ Status Conv2dFprop(
   TensorRef<ElementC, LayoutC> tensor_y_out,
   ElementCompute alpha,
   ElementCompute beta,
-  cudaStream_t stream = nullptr,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale = TensorRef<ElementCompute, layout::RowMajor>(),
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias = TensorRef<ElementCompute, layout::RowMajor>() ) {
+  cudaStream_t stream = nullptr) {

   //
   // Blocking factors improve performance of reference implementation
@@ -1056,9 +1039,7 @@ Status Conv2dFprop(
     tensor_y_in,
     tensor_y_out,
     alpha,
-    beta,
-    tensor_scale,
-    tensor_bias
+    beta
   );

   cudaError_t result = cudaPeekAtLastError();
@@ -1448,9 +1429,7 @@ Status Conv2d(
   TensorRef<ElementC, LayoutC> tensor_D,
   ElementCompute alpha,
   ElementCompute beta,
-  cudaStream_t stream = nullptr,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale = TensorRef<ElementCompute, layout::RowMajor>(),
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias = TensorRef<ElementCompute, layout::RowMajor>() ) {
+  cudaStream_t stream = nullptr) {

   switch (convolutional_operator) {
   case conv::Operator::kFprop:
@@ -1461,7 +1440,7 @@ Status Conv2d(
       ElementCompute,
       ElementAccumulator,
       ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream, tensor_scale, tensor_bias);
+    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
     break;

   case conv::Operator::kDgrad:
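
After this commit, the generic reference entry points accept only alpha and beta; a caller that previously passed tensor_scale/tensor_bias now invokes the dispatch roughly as sketched below. The namespace and the tensor setup are assumptions for illustration; the argument list matches the updated signature in the hunks above.

  // Hedged usage sketch: calling the simplified reference Conv2d dispatch.
  // Assumptions: cutlass::reference::device namespace, HostTensor-style
  // device_ref() accessors; neither is shown in the diff.
  cutlass::Status status = cutlass::reference::device::Conv2d<
    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
    ElementCompute, ElementAccumulator
  >(
    cutlass::conv::Operator::kFprop,
    problem_size,
    tensor_A.device_ref(),
    tensor_B.device_ref(),
    tensor_C.device_ref(),
    tensor_D.device_ref(),
    alpha,
    beta);   // no trailing tensor_scale / tensor_bias arguments anymore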