CUTLASS 3.0.0 (#786)

* CUTLASS 3.0.0
2023-01-23 17:55:28 -08:00
parent 66d9cddc83
commit 277bd6e537
377 changed files with 76396 additions and 1186 deletions
--- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
+++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
@@ -241,8 +241,6 @@ struct SparseTestbed {
    // Determine SMEM requirements and waive if not satisfied
    //

-    int smem_size = int(sizeof(typename Mma::SharedStorage));
-
    cudaDeviceProp properties;
    int device_idx;
    cudaError_t result = cudaGetDevice(&device_idx);
@@ -257,10 +255,6 @@ struct SparseTestbed {
      throw std::runtime_error("cudaGetDeviceProperties() failed");
    }

-    if (properties.sharedMemPerBlockOptin < smem_size) {
-      return false;
-    }
-
    return true;
  }

@@ -415,7 +409,12 @@ struct SparseTestbed {
    bool passed = cutlass::reference::host::TensorEquals(
        matrix_C_computed.host_view(), matrix_C_reference.host_view());

-    EXPECT_TRUE(passed)
+    EXPECT_TRUE(passed);
+
+    if (!passed && CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+
+      std::cout
+        << __FILE__ << ":" << __LINE__ << "  "
        << "A:\n" << matrix_A.host_view() << "\n"
        << "B:\n" << matrix_B.host_view() << "\n"
        << "E:\n" << matrix_E.host_view() << "\n"
@@ -423,6 +422,7 @@ struct SparseTestbed {
        << matrix_C_reference.host_view() << "\n"
        << "Computed:\n"
        << matrix_C_computed.host_view() << "\n";
+    }

    EXPECT_GT(cutlass::reference::host::TensorNorm(matrix_C_reference.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(matrix_C_computed.host_view()), 0);
--- a/test/unit/gemm/threadblock/mma_multistage_testbed.h
+++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h
@@ -193,11 +193,40 @@ struct Testbed {
    matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
  }

+  /// Returns true if the CUDA device is sufficient to execute the kernel; throws std::runtime_error if a CUDA query fails.
+  bool sufficient() const {
+
+    // Look up the current device index and fetch that device's properties.
+    // NOTE(review): no SMEM requirement is compared against 'properties' below, so the
+    // test is never waived here - confirm whether a shared-memory check is intended.
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    return true;  // both queries succeeded; 'properties' is not otherwise inspected
+  }
+
  /// Runs the test
  bool run(
      dim3 grid, dim3 block,
      cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
      cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {
+
+    if (!sufficient()) {
+      return true;
+    }
+
    //
    // initialize device memory
    //
@@ -318,13 +347,18 @@ struct Testbed {
    bool passed = cutlass::reference::host::TensorEquals(
        matrix_C_computed.host_view(), matrix_C_reference.host_view());

-    EXPECT_TRUE(passed) 
+    EXPECT_TRUE(passed);
+
+    if (!passed && CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+      std::cout
+        << __FILE__ << ":" << __LINE__ << "  "
        << "A:\n" << matrix_A.host_view() << "\n"
        << "B:\n" << matrix_B.host_view() << "\n"
        << "Reference:\n"
        << matrix_C_reference.host_view() << "\n"
        << "Computed:\n"
        << matrix_C_computed.host_view() << "\n";
+    }

    EXPECT_GT(cutlass::reference::host::TensorNorm(matrix_C_reference.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(matrix_C_computed.host_view()), 0);
--- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h
+++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h
@@ -217,11 +217,25 @@ struct Testbed {
    matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
  }

+  bool sufficient() {  // Device-sufficiency predicate consulted by run(); performs no device query.
+    return true;  // unconditional: the device is always reported as sufficient
+  }
+
  /// Runs the test
  bool run(
      dim3 grid, dim3 block,
      cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
      cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+
    //
    // initialize device memory
    //
@@ -300,7 +314,7 @@ struct Testbed {

    cudaError_t result = cudaDeviceSynchronize();
    EXPECT_EQ(result, cudaSuccess)
-        << " kernel error: " << cudaGetErrorString(result);
+        << " kernel error: " << cudaGetErrorString(result) << " on device " << GetCudaDevice();

    matrix_C_computed.sync_host();

@@ -316,7 +330,7 @@ struct Testbed {
    bool passed = cutlass::reference::host::TensorEquals(
        matrix_C_computed.host_view(), matrix_C_reference.host_view());

-    EXPECT_TRUE(passed);
+    EXPECT_TRUE(passed) << "Failed on device " << GetCudaDevice();

    if (!passed) {
      std::ofstream output("mma_pipelined_testbed_errors.txt");