CUTLASS 2.7 (#318)
CUTLASS 2.7: mainloop fusion for GEMM (summation over A or B); strided dgrad (optimized iterators); half-precision GELU_taylor activation functions (use these when accumulation and epilogue compute types are all cutlass::half_t); tuning and bug fixes to the fused GEMM + GEMM example; support for convolutions with smaller than 128b alignment (see examples); caching of results to accelerate convolution unit tests (can be enabled or disabled by running cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF); corrections and bug fixes reported by the CUTLASS community — thank you for filing these issues! Co-authored-by: Haicheng Wu haichengw@nvidia.com, Manish Gupta manigupta@nvidia.com, Dustyn Blasig dblasig@nvidia.com, Andrew Kerr akerr@nvidia.com
This commit is contained in:
@ -47,6 +47,8 @@
|
||||
#include "cutlass/core_io.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
#include "cache_testbed_output.h"
|
||||
|
||||
namespace test {
|
||||
namespace conv {
|
||||
namespace device {
|
||||
@ -116,7 +118,6 @@ public:
|
||||
cutlass::Distribution::Kind dist_kind,
|
||||
uint64_t seed) {
|
||||
|
||||
//cutlass::reference::host::TensorFill(view, Element(1.0f));
|
||||
if (dist_kind == cutlass::Distribution::Uniform) {
|
||||
|
||||
int scope;
|
||||
@ -126,7 +127,12 @@ public:
|
||||
scope = 2;
|
||||
}
|
||||
else if (bits == 16) {
|
||||
scope = 3;
|
||||
if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
|
||||
scope = 3;
|
||||
}
|
||||
else {
|
||||
scope = 5;
|
||||
}
|
||||
}
|
||||
else {
|
||||
scope = 8;
|
||||
@ -137,6 +143,7 @@ public:
|
||||
else if (dist_kind == cutlass::Distribution::Identity) {
|
||||
|
||||
cutlass::reference::host::TensorFillIdentity(view);
|
||||
|
||||
}
|
||||
else if (dist_kind == cutlass::Distribution::Gaussian) {
|
||||
|
||||
@ -321,6 +328,50 @@ public:
|
||||
|
||||
tensor_D_computed.sync_host();
|
||||
|
||||
//
|
||||
// Reference check - support caching results
|
||||
//
|
||||
|
||||
CachedTestKey cached_test_key = CreateCachedConv2dTestKey<
|
||||
ElementA, LayoutA,
|
||||
ElementB, LayoutB,
|
||||
ElementC, LayoutC,
|
||||
ElementAccumulator,
|
||||
ElementCompute
|
||||
>(
|
||||
kConvolutionalOperator,
|
||||
problem_size,
|
||||
alpha,
|
||||
beta,
|
||||
tensor_A.host_view(),
|
||||
tensor_B.host_view(),
|
||||
tensor_C.host_view()
|
||||
);
|
||||
|
||||
//
|
||||
// Look for the cached key
|
||||
//
|
||||
|
||||
bool cached_result_loaded = false;
|
||||
CachedTestResult cached_test_result;
|
||||
|
||||
std::string conv2d_result_cache_name =
|
||||
std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
|
||||
|
||||
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
|
||||
|
||||
CachedTestResultListing cached_results(conv2d_result_cache_name);
|
||||
|
||||
auto cached = cached_results.find(cached_test_key);
|
||||
|
||||
cached_result_loaded = cached.first;
|
||||
if (cached_result_loaded) {
|
||||
cached_test_result = cached.second;
|
||||
}
|
||||
}
|
||||
|
||||
if (!cached_result_loaded) {
|
||||
|
||||
#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
|
||||
|
||||
cutlass::reference::device::Conv2d<
|
||||
@ -367,9 +418,32 @@ public:
|
||||
beta);
|
||||
|
||||
#endif
|
||||
passed = cutlass::reference::host::TensorEquals(
|
||||
tensor_D_computed.host_view(),
|
||||
tensor_D_reference.host_view());
|
||||
|
||||
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
|
||||
|
||||
cached_test_result.D = TensorHash(tensor_D_reference.host_view());
|
||||
|
||||
CachedTestResultListing cached_results(conv2d_result_cache_name);
|
||||
|
||||
cached_results.append(cached_test_key, cached_test_result);
|
||||
cached_results.write(conv2d_result_cache_name);
|
||||
}
|
||||
} // if (!cached_result_loaded)
|
||||
|
||||
uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
|
||||
|
||||
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
|
||||
passed = (tensor_D_hash == cached_test_result.D);
|
||||
|
||||
EXPECT_EQ(tensor_D_hash, cached_test_result.D)
|
||||
<< "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
|
||||
}
|
||||
else {
|
||||
|
||||
passed = cutlass::reference::host::TensorEquals(
|
||||
tensor_D_computed.host_view(),
|
||||
tensor_D_reference.host_view());
|
||||
}
|
||||
|
||||
EXPECT_TRUE(passed);
|
||||
|
||||
@ -416,9 +490,18 @@ public:
|
||||
results
|
||||
<< "\nA:\n" << tensor_A.host_view() << "\n"
|
||||
<< "\nB:\n" << tensor_B.host_view() << "\n"
|
||||
<< "\nC:\n" << tensor_C.host_view() << "\n"
|
||||
<< "\nD reference:\n" << tensor_D_reference.host_view() << "\n"
|
||||
<< "\nD computed:\n" << tensor_D_computed.host_view() << "\n";
|
||||
<< "\nC:\n" << tensor_C.host_view() << "\n";
|
||||
|
||||
results << "\nD reference (hash: " << cached_test_result.D << ")\n";
|
||||
|
||||
if (!cached_result_loaded) {
|
||||
results
|
||||
<< tensor_D_reference.host_view() << "\n";
|
||||
}
|
||||
|
||||
results
|
||||
<< "\nD computed (hash: " << tensor_D_hash << ")\n"
|
||||
<< tensor_D_computed.host_view() << "\n";
|
||||
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user