Updates to fused epilogue (#383)

* Enhancements and fixes to fused GEMM and Convolution epilogue.
* Need to explicitly list cudart as unit test library dependency.
This commit is contained in:
Andrew Kerr
2021-12-17 16:04:43 -05:00
committed by GitHub
parent 4e666e1dfd
commit ec4f7e5194
24 changed files with 372 additions and 193 deletions

View File

@ -99,6 +99,8 @@ public:
cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
int tested_problem_count;
public:
TestbedConv2d(
@ -107,7 +109,7 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_), tested_problem_count(0) {
}
@ -220,7 +222,10 @@ public:
return true;
}
#if 0 //display conv2d problem size for debugging
// increment tested problem count run by the testbed
tested_problem_count++;
#if 0 // display conv2d problem size for debugging
std::cout << problem_size << std::endl
<< "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
<< "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
@ -537,78 +542,96 @@ bool TestAllConv2d(
// Vector of conv2d problem sizes to avoid duplicate runs
Conv2dProblemVector conv_tested_sizes;
Conv2dProblemVector const *problem_vectors[] = {
&conv_test_sizes, // run user specified sizes
&conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
//&conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
// Vectors of Conv2dProblemVector (lenient/easiest to rigorous problem sizes)
std::vector<Conv2dProblemVector> problem_vectors = {
conv_test_sizes, // run user specified sizes
conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
//conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
&conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
#endif
};
// Flatten 2D problem_vectors into a 1D problem_sizes
std::vector<cutlass::conv::Conv2dProblemSize> problem_sizes;
for (auto problem_vector : problem_vectors) {
for(auto conv_problem : problem_vector) {
problem_sizes.push_back(conv_problem);
}
}
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reverse the order (rigorous to lenient)
// run the most rigorous problem size first
if (CutlassUnitTestProblemCount()) {
std::reverse(problem_sizes.begin(), problem_sizes.end());
}
// Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
for (Conv2dProblemVector const * problem_vector : problem_vectors) {
for(auto conv_problem : problem_sizes) {
// Run conv testbed on default convolution sizes
for(auto conv_problem : *problem_vector) {
// Skip blacklist and avoid duplicate problem sizes
if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
continue;
}
// Skip blacklist and avoid duplicate problem sizes
if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
//
// Procedurally disable certain cases
//
// CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
cutlass::conv::StrideSupport::kUnity)) {
if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
continue;
}
}
//
// Procedurally disable certain cases
//
// CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
// Although strided dgrad works for all stride combinations, we are only going
// to run strided dgrad for non-unity strides
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
cutlass::conv::StrideSupport::kStrided)) {
if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
continue;
}
}
//
// Test
//
// push back tested problem size to avoid re-running duplicates
conv_tested_sizes.push_back(conv_problem);
// test mode = xcross
passed = testbed.run(
conv_problem,
cutlass::conv::SplitKMode::kSerial);
// CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
cutlass::conv::StrideSupport::kUnity)) {
if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
continue;
}
}
// CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
// Although strided dgrad works for all stride combinations, we are only going
// to run strided dgrad for non-unity strides
if ((ImplicitGemm::kConvolutionalOperator ==
cutlass::conv::Operator::kDgrad) &&
(ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
cutlass::conv::StrideSupport::kStrided)) {
if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
continue;
}
}
//
// Test
//
// push back tested problem size to avoid re-running duplicates
conv_tested_sizes.push_back(conv_problem);
// test mode = xcross
passed = testbed.run(
conv_problem,
cutlass::conv::SplitKMode::kSerial);
if (!passed) {
return false;
}
if (!passed) {
return false;
}
// test mode = convolution
passed = testbed.run(
conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
cutlass::conv::SplitKMode::kSerial);
if (!passed) {
return false;
}
// test mode = convolution
passed = testbed.run(
conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
cutlass::conv::SplitKMode::kSerial);
if (!passed) {
return false;
}
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts
if (CutlassUnitTestProblemCount() &&
testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
return true;
}
}
// CUTLASS DGRAD's *strided* specialization does not support split-k mode
if ((ImplicitGemm::kConvolutionalOperator ==
@ -677,6 +700,12 @@ bool TestAllConv2d(
if (!passed) {
return false;
}
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts
if (CutlassUnitTestProblemCount() &&
testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
return true;
}
}
}
}