Cutlass 1.3 Release (#42)

CUTLASS 1.3 Release - Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1.
2019-03-20 10:49:17 -07:00
parent 19a9d64e3c
commit 877bdcace6
256 changed files with 16930 additions and 802 deletions
--- a/examples/06_splitK_gemm/CMakeLists.txt
+++ b/examples/06_splitK_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/06_splitK_gemm/splitK_gemm.cu
+++ b/examples/06_splitK_gemm/splitK_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -84,11 +84,7 @@ cudaError_t cutlass_splitK_sgemm_nn(float const *A,
  typename deviceGemm::Params deviceGemmParams(m, n, k);

  // query if workspace is needed. the workspace size is sizeof(accumulateType) * M * N * splits_count
-  int workspace_size = deviceGemmParams.required_workspace_memory_in_byte();
-  if (workspace_size <= 0) {
-    std::cerr << "splitK workspace_size is smaller than 0" << std::endl;
-    return cudaErrorInvalidValue;
-  }
+  size_t workspace_size = deviceGemmParams.required_workspace_memory_in_byte();

  // allocate workspace memory
  float *workspace_ptr;