b2b bias vector support (#482)
* b2b bias vector support
* add files

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
@@ -82,9 +82,7 @@ __global__ void Conv2dFprop(
   TensorRef<ElementC, LayoutC> tensor_y_in,
   TensorRef<ElementC, LayoutC> tensor_y_out,
   ElementCompute alpha,
-  ElementCompute beta,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias
+  ElementCompute beta
   ) {
 
   ConvertOp convert_op;
@@ -186,26 +184,13 @@ __global__ void Conv2dFprop(
       int thread_k = k_start + n;
       if (thread_k < problem_size.K) {
 
-        if(alpha == ElementCompute()) { // use per-channel scale and bias
-          ElementCompute scale = tensor_scale.at({0, thread_k});
-          ElementCompute bias = tensor_bias.at({0, thread_k});
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            scale * ElementCompute(accum[m][n]) + bias);
-        }
-        else if(tensor_bias.good()) { // use per-channel bias
-          ElementCompute bias = tensor_bias.at({0, thread_k});
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + bias);
-        }
-        else {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
-          }
-
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
+        ElementCompute c_ref = ElementCompute();
+        if (beta != ElementCompute()) {
+          c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
+        }
+
+        tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
+          alpha * ElementCompute(accum[m][n]) + beta * c_ref);
       }
     }
   }
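A note for readers of the hunk above: the block being deleted implemented three epilogue variants in the reference kernel, selected by alpha and by whether the bias TensorRef was valid. The standalone sketch below restates that selection logic with plain float scalars for clarity; the function name and its parameters are illustrative only and are not part of the CUTLASS API.

// Illustrative restatement of the removed epilogue selection (not CUTLASS API).
// 'bias_is_valid' stands in for tensor_bias.good(); 'scale' and 'bias' stand in
// for the per-channel values tensor_scale.at({0, k}) and tensor_bias.at({0, k}).
float reference_epilogue(float accum, float alpha, float beta, float c_ref,
                         bool bias_is_valid, float scale, float bias) {
  if (alpha == 0.0f) {
    return scale * accum + bias;          // per-channel scale and bias
  }
  else if (bias_is_valid) {
    return alpha * accum + bias;          // per-channel bias only
  }
  else {
    return alpha * accum + beta * c_ref;  // ordinary linear combination
  }
}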
@@ -1015,9 +1000,7 @@ Status Conv2dFprop(
   TensorRef<ElementC, LayoutC> tensor_y_out,
   ElementCompute alpha,
   ElementCompute beta,
-  cudaStream_t stream = nullptr,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale = TensorRef<ElementCompute, layout::RowMajor>(),
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias = TensorRef<ElementCompute, layout::RowMajor>() ) {
+  cudaStream_t stream = nullptr) {
 
   //
   // Blocking factors improve performance of reference implementation
@@ -1056,9 +1039,7 @@ Status Conv2dFprop(
     tensor_y_in,
     tensor_y_out,
     alpha,
-    beta,
-    tensor_scale,
-    tensor_bias
+    beta
   );
 
   cudaError_t result = cudaPeekAtLastError();
@@ -1448,9 +1429,7 @@ Status Conv2d(
   TensorRef<ElementC, LayoutC> tensor_D,
   ElementCompute alpha,
   ElementCompute beta,
-  cudaStream_t stream = nullptr,
-  TensorRef<ElementCompute, layout::RowMajor> tensor_scale = TensorRef<ElementCompute, layout::RowMajor>(),
-  TensorRef<ElementCompute, layout::RowMajor> tensor_bias = TensorRef<ElementCompute, layout::RowMajor>() ) {
+  cudaStream_t stream = nullptr) {
 
   switch (convolutional_operator) {
   case conv::Operator::kFprop:
@@ -1461,7 +1440,7 @@ Status Conv2d(
       ElementCompute,
       ElementAccumulator,
       ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream, tensor_scale, tensor_bias);
+    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
     break;
 
   case conv::Operator::kDgrad:
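For orientation, here is a hedged sketch of how a caller might invoke the simplified reference entry point after this change. It relies only on what the hunks above show (Conv2d dispatching on the convolution operator and taking alpha, beta, and an optional stream) plus the cutlass::util HostTensor helper; the element and layout choices, the helper function name, and the include list are illustrative assumptions, not taken from the commit.

#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/convolution.h"

// Hedged usage sketch: only alpha and beta (plus an optional stream) are passed now;
// the trailing tensor_scale / tensor_bias arguments removed above no longer exist.
cutlass::Status run_reference_fprop(
  cutlass::conv::Conv2dProblemSize const &problem_size,
  cutlass::HostTensor<cutlass::half_t, cutlass::layout::TensorNHWC> &tensor_A,
  cutlass::HostTensor<cutlass::half_t, cutlass::layout::TensorNHWC> &tensor_B,
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &tensor_C,
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &tensor_D,
  float alpha, float beta) {

  return cutlass::reference::device::Conv2d<
    cutlass::half_t, cutlass::layout::TensorNHWC,  // ElementA, LayoutA
    cutlass::half_t, cutlass::layout::TensorNHWC,  // ElementB, LayoutB
    float, cutlass::layout::TensorNHWC,            // ElementC, LayoutC
    float,                                         // ElementCompute
    float                                          // ElementAccumulator
  >(
    cutlass::conv::Operator::kFprop,
    problem_size,
    tensor_A.device_ref(),
    tensor_B.device_ref(),
    tensor_C.device_ref(),
    tensor_D.device_ref(),
    alpha,
    beta);
}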