CUTLASS 2.7 (#318)

CUTLASS 2.7 Mainloop fusion for GEMM: summation over A or B Strided DGRAD (optimized iterators) Half-precision GELU_taylor activation functions Use these when accumulation and epilogue compute types are all cutlass::half_t Tuning and bug fixes to fused GEMM + GEMM example Support for smaller than 128b aligned Convolutions: see examples Caching of results to accelerate Convolution unit tests Can be enabled or disabled by running cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF Corrections and bug fixes reported by the CUTLASS community Thank you for filing these issues! authored-by: Haicheng Wu haichengw@nvidia.com, Manish Gupta manigupta@nvidia.com, Dustyn Blasig dblasig@nvidia.com, Andrew Kerr akerr@nvidia.com
2021-09-20 11:02:22 -07:00
parent 9ac255863f
commit 2e07c4cc2f
62 changed files with 5611 additions and 186 deletions
--- a/test/unit/conv/device/conv2d_testbed.h
+++ b/test/unit/conv/device/conv2d_testbed.h
@ -47,6 +47,8 @@
 #include "cutlass/core_io.h"
 #include "cutlass/util/tensor_view_io.h"

+#include "cache_testbed_output.h"
+
 namespace test {
 namespace conv {
 namespace device {
@ -116,7 +118,6 @@ public:
    cutlass::Distribution::Kind dist_kind,
    uint64_t seed) {

-//cutlass::reference::host::TensorFill(view, Element(1.0f));
    if (dist_kind == cutlass::Distribution::Uniform) {

      int scope;
@ -126,7 +127,12 @@ public:
        scope = 2;
      }
      else if (bits == 16) {
-        scope = 3;
+        if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
+          scope = 3;
+        }
+        else {
+          scope = 5;
+        }
      }
      else {
        scope = 8;
@ -137,6 +143,7 @@ public:
    else if (dist_kind == cutlass::Distribution::Identity) {

      cutlass::reference::host::TensorFillIdentity(view);
+
    } 
    else if (dist_kind == cutlass::Distribution::Gaussian) {

@ -321,6 +328,50 @@ public:

    tensor_D_computed.sync_host();

+    //
+    // Reference check - support caching results
+    //
+
+    CachedTestKey cached_test_key = CreateCachedConv2dTestKey<
+        ElementA, LayoutA,
+        ElementB, LayoutB,
+        ElementC, LayoutC,
+        ElementAccumulator,
+        ElementCompute
+      >(
+        kConvolutionalOperator,
+        problem_size, 
+        alpha, 
+        beta, 
+        tensor_A.host_view(),
+        tensor_B.host_view(),
+        tensor_C.host_view()
+      );
+
+    //
+    // Look for the cached key
+    //
+
+    bool cached_result_loaded = false;
+    CachedTestResult cached_test_result;
+
+    std::string conv2d_result_cache_name = 
+      std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
+
+    if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
+
+      CachedTestResultListing cached_results(conv2d_result_cache_name);
+
+      auto cached = cached_results.find(cached_test_key);
+
+      cached_result_loaded = cached.first;
+      if (cached_result_loaded) {
+        cached_test_result = cached.second;
+      }
+    }
+    
+    if (!cached_result_loaded) {
+
 #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED

    cutlass::reference::device::Conv2d<
@ -367,9 +418,32 @@ public:
      beta);

 #endif
-    passed = cutlass::reference::host::TensorEquals(
-      tensor_D_computed.host_view(), 
-      tensor_D_reference.host_view());
+
+      if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
+
+        cached_test_result.D = TensorHash(tensor_D_reference.host_view());
+
+        CachedTestResultListing cached_results(conv2d_result_cache_name);
+
+        cached_results.append(cached_test_key, cached_test_result);
+        cached_results.write(conv2d_result_cache_name);
+      }
+    } // if (!cached_result_loaded)
+
+    uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
+
+    if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
+      passed = (tensor_D_hash == cached_test_result.D);
+
+      EXPECT_EQ(tensor_D_hash, cached_test_result.D) 
+        << "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
+    }
+    else {
+
+      passed = cutlass::reference::host::TensorEquals(
+        tensor_D_computed.host_view(), 
+        tensor_D_reference.host_view());
+    }

    EXPECT_TRUE(passed);

@ -416,9 +490,18 @@ public:
      results
        << "\nA:\n" << tensor_A.host_view() << "\n"
        << "\nB:\n" << tensor_B.host_view() << "\n"
-        << "\nC:\n" << tensor_C.host_view() << "\n"
-        << "\nD reference:\n" << tensor_D_reference.host_view() << "\n"
-        << "\nD computed:\n" << tensor_D_computed.host_view() << "\n";
+        << "\nC:\n" << tensor_C.host_view() << "\n";
+
+      results << "\nD reference (hash: " << cached_test_result.D << ")\n";
+
+      if (!cached_result_loaded) {
+        results
+          << tensor_D_reference.host_view() << "\n";  
+      }
+
+      results
+        << "\nD computed (hash: " << tensor_D_hash << ")\n" 
+        << tensor_D_computed.host_view() << "\n";

    }