CUTLASS 2.7 (#318)

CUTLASS 2.7

Mainloop fusion for GEMM: summation over A or B
Strided DGRAD (optimized iterators)
Half-precision GELU_taylor activation functions
Use these when accumulation and epilogue compute types are all cutlass::half_t
Tuning and bug fixes to fused GEMM + GEMM example
Support for Convolutions with alignment smaller than 128b: see examples
Caching of results to accelerate Convolution unit tests
Can be enabled or disabled through the CMake option CUTLASS_TEST_ENABLE_CACHED_RESULTS, e.g. run cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF to disable it
Corrections and bug fixes reported by the CUTLASS community
Thank you for filing these issues!

authored-by: Haicheng Wu <haichengw@nvidia.com>, Manish Gupta <manigupta@nvidia.com>, Dustyn Blasig <dblasig@nvidia.com>, Andrew Kerr <akerr@nvidia.com>
This commit is contained in:
Manish Gupta
2021-09-20 11:02:22 -07:00
committed by GitHub
parent 9ac255863f
commit 2e07c4cc2f
62 changed files with 5611 additions and 186 deletions

View File

@ -47,6 +47,8 @@
#include "cutlass/core_io.h"
#include "cutlass/util/tensor_view_io.h"
#include "cache_testbed_output.h"
namespace test {
namespace conv {
namespace device {
@ -116,7 +118,6 @@ public:
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
//cutlass::reference::host::TensorFill(view, Element(1.0f));
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
@ -126,7 +127,12 @@ public:
scope = 2;
}
else if (bits == 16) {
scope = 3;
if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
scope = 3;
}
else {
scope = 5;
}
}
else {
scope = 8;
@ -137,6 +143,7 @@ public:
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
@ -321,6 +328,50 @@ public:
tensor_D_computed.sync_host();
//
// Reference check - support caching results
//
CachedTestKey cached_test_key = CreateCachedConv2dTestKey<
ElementA, LayoutA,
ElementB, LayoutB,
ElementC, LayoutC,
ElementAccumulator,
ElementCompute
>(
kConvolutionalOperator,
problem_size,
alpha,
beta,
tensor_A.host_view(),
tensor_B.host_view(),
tensor_C.host_view()
);
//
// Look for the cached key
//
bool cached_result_loaded = false;
CachedTestResult cached_test_result;
std::string conv2d_result_cache_name =
std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
CachedTestResultListing cached_results(conv2d_result_cache_name);
auto cached = cached_results.find(cached_test_key);
cached_result_loaded = cached.first;
if (cached_result_loaded) {
cached_test_result = cached.second;
}
}
if (!cached_result_loaded) {
#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
cutlass::reference::device::Conv2d<
@ -367,9 +418,32 @@ public:
beta);
#endif
passed = cutlass::reference::host::TensorEquals(
tensor_D_computed.host_view(),
tensor_D_reference.host_view());
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
cached_test_result.D = TensorHash(tensor_D_reference.host_view());
CachedTestResultListing cached_results(conv2d_result_cache_name);
cached_results.append(cached_test_key, cached_test_result);
cached_results.write(conv2d_result_cache_name);
}
} // if (!cached_result_loaded)
uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
passed = (tensor_D_hash == cached_test_result.D);
EXPECT_EQ(tensor_D_hash, cached_test_result.D)
<< "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
}
else {
passed = cutlass::reference::host::TensorEquals(
tensor_D_computed.host_view(),
tensor_D_reference.host_view());
}
EXPECT_TRUE(passed);
@ -416,9 +490,18 @@ public:
results
<< "\nA:\n" << tensor_A.host_view() << "\n"
<< "\nB:\n" << tensor_B.host_view() << "\n"
<< "\nC:\n" << tensor_C.host_view() << "\n"
<< "\nD reference:\n" << tensor_D_reference.host_view() << "\n"
<< "\nD computed:\n" << tensor_D_computed.host_view() << "\n";
<< "\nC:\n" << tensor_C.host_view() << "\n";
results << "\nD reference (hash: " << cached_test_result.D << ")\n";
if (!cached_result_loaded) {
results
<< tensor_D_reference.host_view() << "\n";
}
results
<< "\nD computed (hash: " << tensor_D_hash << ")\n"
<< tensor_D_computed.host_view() << "\n";
}