CUTLASS 2.6 (#298)

CUTLASS 2.6
This commit is contained in:
Manish Gupta
2021-07-22 21:40:53 -07:00
committed by GitHub
parent 6c29fe20ba
commit e5d51840e8
308 changed files with 32408 additions and 4722 deletions

View File

@ -81,7 +81,7 @@ public:
>;
using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
using ReductionStrideIndex = typename ReductionDevice::StrideIndex;
public:
@ -161,7 +161,7 @@ public:
initialize_tensor(tensor_A.host_view(), init_A, seed);
initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
tensor_A.sync_device();
tensor_B.sync_device();
tensor_C.sync_device();
@ -214,7 +214,7 @@ public:
#if 0 //display conv2d problem size for debugging
std::cout << problem_size << std::endl
<< "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl
<< "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
<< "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
<< std::endl;
#endif
@ -262,7 +262,7 @@ public:
if (status != cutlass::Status::kSuccess) {
return false;
}
// run conv2d operator
status = conv2d_op();
@ -271,6 +271,7 @@ public:
return false;
}
if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
// configure parallel reduction operator
@ -280,10 +281,20 @@ public:
cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
problem_size.split_k_slices,
cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
{reinterpret_cast<ElementAccumulator*> (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
{tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
{tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
{alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C
{
reinterpret_cast<ElementAccumulator*> (workspace.get()),
ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
},
{
tensor_D_computed.device_data(),
ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
},
{
tensor_C.device_data(),
ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
},
// apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C
{alpha, beta}
);
status = reduction_op.initialize(reduction_args, nullptr);
@ -302,7 +313,11 @@ public:
}
}
bool passed = false;
cudaError_t result = cudaDeviceSynchronize();
EXPECT_EQ(result, cudaSuccess) << " device reference error: "
<< cudaGetErrorString(result);
tensor_D_computed.sync_host();
#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
@ -326,10 +341,6 @@ public:
alpha,
beta);
cudaError_t result = cudaDeviceSynchronize();
EXPECT_EQ(result, cudaSuccess) << " device reference error: "
<< cudaGetErrorString(result);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D_reference.sync_host();
@ -445,7 +456,7 @@ bool TestAllConv2d(
Conv2dProblemVector const *problem_vectors[] = {
&conv_test_sizes, // run user specified sizes
&conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
&conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
//&conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
&conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
#endif
@ -467,7 +478,7 @@ bool TestAllConv2d(
// Procedurally disable certain cases
//
// CUTLASS DGRAD's unity stride specialization only support stride {1, 1}
// CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
@ -477,6 +488,18 @@ bool TestAllConv2d(
}
}
// CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
// Although strided dgrad works for all stride combinations, we are only going
// to run strided dgrad for non-unity strides
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
cutlass::conv::StrideSupport::kStrided)) {
if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
continue;
}
}
//
// Test
//
@ -491,7 +514,7 @@ bool TestAllConv2d(
if (!passed) {
return false;
}
// test mode = convolution
passed = testbed.run(
conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
@ -503,6 +526,30 @@ bool TestAllConv2d(
}
}
// CUTLASS DGRAD's *strided* specialization does not support split-k mode
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
cutlass::conv::StrideSupport::kStrided)) {
passed = testbed.run(
cutlass::conv::Conv2dProblemSize(
{1, 56, 56, 8}, // input size (NHWC)
{8, 1, 1, 8}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{2, 2}, // stride (stride_h, stride_w)
{1, 1}), // dilation (dilation_h, dilation_w)
cutlass::conv::SplitKMode::kSerial,
cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0),
cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));
if (!passed) {
return false;
}
return passed;
}
// Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are absolutely necessary to catch functional bugs. The below code does provide option to sweep