CUTLASS 2.1 (#83)

CUTLASS 2.1 contributes: - BLAS-style host-side API added to CUTLASS Library - Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores - Minor enhancements and bug fixes
2020-04-07 13:51:25 -07:00
parent 7c0cd26d13
commit 96dab34ad9
196 changed files with 20653 additions and 1995 deletions
--- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
+++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@ -29,7 +29,7 @@ provided by CUTLASS using tensor cores; which we run on a NVIDIA Volta GPU.

 Writing a single high performance matrix multiplication kernel is hard but do-able. Whereas writing
 high performance kernels at scale which works for multiple problem sizes with good abstractions is
-really hard. CUTLASS solves this problem by providing simplified abstractions (knobs) to compose
+really hard. CUTLASS solves this problem by providing simplified abstractions to compose
 multiple sections of gemm kernel. When used properly, the kernels can hit peak performance of GPU
 easily.

@ -189,13 +189,27 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
                                         NumStages>;

 int main() {
+
+  // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1.
+  //
+  // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
+    std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+    return -1;
+  }
+
  cudaDeviceProp props;
-  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }

  if (!(props.major >= 7)) {
    std::cerr << "Volta Tensor Ops must be run on a machine with compute capability at least 70."
              << std::endl;
-    return 0;
+    return -1;
  }

  const int length_m = 5120;