Updates for 3.1 (#932)

2023-04-29 06:34:27 -07:00
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions
--- a/tools/profiler/CMakeLists.txt
+++ b/tools/profiler/CMakeLists.txt
@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM   --operation=Symm       --provid
 cutlass_add_executable_tests(
  test_profiler cutlass_profiler
  DEPENDEES test_all
-  TEST_COMMAND_OPTIONS 
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM
-    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM
+  TEST_COMMAND_OPTIONS
+    GEMM
+    CONV2D
+    CONV3D
+    SPGEMM
+    RANK_K
+    RANK_2K
+    TRMM
+    SYMM
+  TEST_COMMAND_OPTIONS_PREFIX
+    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_
  DISABLE_EXECUTABLE_INSTALL_RULE
  )

--- a/tools/profiler/src/cutlass_profiler.cu
+++ b/tools/profiler/src/cutlass_profiler.cu
@ -124,7 +124,7 @@ int CutlassProfiler::operator()() {
    options_.execution_mode == ExecutionMode::kTrace) {

    // Profiles all operations
-    profile_();
+    return profile_();
  }
  else if (options_.execution_mode == ExecutionMode::kEnumerate) {
    // Enumerates all operations
@ -157,7 +157,7 @@ int CutlassProfiler::profile_() {

      if (result) {
        return result;
-      } 
+      }
    }
  }

--- a/tools/profiler/src/device_allocation.cu
+++ b/tools/profiler/src/device_allocation.cu
@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const {

 /// Copies from an equivalent-sized tensor in device memory
 void DeviceAllocation::copy_from_device(void const *ptr) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping copy of size 0 allocation\n";
+#endif
+    return;
+  }
+
  cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice);
  if (result != cudaSuccess) {
    throw std::runtime_error("Failed device-to-device copy");
@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) {

 /// Copies from an equivalent-sized tensor in host memory
 void DeviceAllocation::copy_from_host(void const *ptr) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping copy of size 0 allocation\n";
+#endif
+    return;
+  }
+
  cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice);
  if (result != cudaSuccess) {
-    throw std::runtime_error("Failed device-to-device copy");
+    throw std::runtime_error("Failed host-to-device copy");
  }
 }

 /// Copies to an equivalent-sized tensor in host memory
 void DeviceAllocation::copy_to_host(void *ptr) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping copy of size 0 allocation\n";
+#endif
+    return;
+  }
+
  cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost);
  if (result != cudaSuccess) {
-    throw std::runtime_error("Failed device-to-device copy");
+    throw std::runtime_error("Failed device-to-host copy");
  }
 }

 void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
-  if (!good()) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping initialization of size 0 allocation\n";
+#endif
+    return;
+  }
+
+  if (!data()) {
    throw std::runtime_error("Attempting to initialize invalid allocation.");
  }

@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
 }

 void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
-  if (!good()) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping initialization of size 0 allocation\n";
+#endif
+    return;
+  }
+
+  if (!data()) {
    throw std::runtime_error("Attempting to initialize invalid allocation.");
  }

@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
  switch (type_) {
  case library::NumericTypeID::kFE4M3:
    cutlass::reference::host::BlockFillRandom<cutlass::float_e4m3_t>(
-      reinterpret_cast<cutlass::float_e4m3_t *>(pointer_),
+      reinterpret_cast<cutlass::float_e4m3_t *>(host_data.data()),
      capacity_,
      seed,
      dist
@ -707,7 +742,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
    break;
  case library::NumericTypeID::kFE5M2:
    cutlass::reference::host::BlockFillRandom<cutlass::float_e5m2_t>(
-      reinterpret_cast<cutlass::float_e5m2_t *>(pointer_),
+      reinterpret_cast<cutlass::float_e5m2_t *>(host_data.data()),
      capacity_,
      seed,
      dist
@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
 }

 void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) {
-  if (!good()) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping initialization of size 0 allocation\n";
+#endif
+    return;
+  }
+
+  if (!data()) {
    throw std::runtime_error("Attempting to initialize invalid allocation.");
  }

@ -934,7 +976,14 @@ void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSiz
 }

 void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) {
-  if (!good()) {
+  if (!bytes()) {
+#ifndef NDEBUG
+    std::cout << "Skipping initialization of size 0 allocation\n";
+#endif
+    return;
+  }
+
+  if (!data()) {
    throw std::runtime_error("Attempting to initialize invalid allocation.");
  }

--- a/tools/profiler/src/gemm_operation_profiler.cu
+++ b/tools/profiler/src/gemm_operation_profiler.cu
@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
      {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
      {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
      {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
+      {ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"},
      {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
      {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
      {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"},
@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse(
    return Status::kErrorInvalidProblem;
  }

+  if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
  if (!arg_as_scalar(
    this->alpha, 
    operation_desc.element_epilogue, 
@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
  set_argument(result, "C", problem_space,
    std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));

+  set_argument(result, "D", problem_space,
+    std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout));
+
  set_argument(result, "m", problem_space, m);
  set_argument(result, "n", problem_space, n);
  set_argument(result, "k", problem_space, k);
@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace(
      problem_.batch_count * gemm_workspace_.problem_count
    );

-    gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
-
    // NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels
    gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n), int(problem_.k)};
    gemm_workspace_.arguments.batch_count = problem_.batch_count;
--- a/tools/profiler/src/operation_profiler.cu
+++ b/tools/profiler/src/operation_profiler.cu
@ -270,17 +270,17 @@ int OperationProfiler::profile_all(
  ProblemSpace::Iterator problem_it = problem_space.begin();
  ProblemSpace::Iterator problem_end = problem_space.end();

-  bool continue_profiling = true, internal_error = false;
+  bool continue_profiling = true;
+  int retval = 0;

  // For each problem in problem space
  for (; continue_profiling && problem_it != problem_end; ++problem_it) {
-
    ProblemSpace::Problem problem = problem_it.at();
-
    report.next_problem();

    // For each operation in manifest
-    for (auto const & operation_ptr : manifest) {
+    int matched_operation_count = 0;
+    for (auto const& operation_ptr : manifest) {

      library::Operation const *operation = operation_ptr.get();

@ -292,8 +292,8 @@ int OperationProfiler::profile_all(

      // Execute compatible cutlass operations if they satisfy the current device's compute capability
      if (operation->description().kind == kind_ &&
-        operation->description().provider == library::Provider::kCUTLASS &&
-        options.device.compute_capability() >= min_cc &&
+          operation->description().provider == library::Provider::kCUTLASS &&
+          options.device.compute_capability() >= min_cc &&
          options.device.compute_capability() <= max_cc) {

        std::string operation_name(operation->description().name);
@ -320,7 +320,10 @@ int OperationProfiler::profile_all(
        if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) {
          continue;
        }
-      
+
+        // We have found a matching kernel, so increment the count of matched kernels
+        ++matched_operation_count;
+
        // A. Initialize configuration
        Status status = this->initialize_configuration(
          options,
@ -374,7 +377,6 @@ int OperationProfiler::profile_all(
        //

        // B. Verify CUTLASS
-         
        if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) {

          continue_profiling = this->verify_cutlass(
@ -426,10 +428,18 @@ int OperationProfiler::profile_all(
      if (!continue_profiling) {
        break;
      }
-    } 
+    }
+
+    // If we did not find any kernels that match our filters and error_on_no_match was set, report an error
+    if (options.profiling.error_on_no_match && matched_operation_count <= 0) {
+      #if !NDEBUG
+      std::cout << "Error: No matching kernels found with kernel selection filters [--error_on_no_match]" << std::endl;
+      #endif
+      retval = 1;
+    }
  }

-  return internal_error ? 1 : 0;
+  return retval;
 }

 ///////////////////////////////////////////////////////////////////////////////////////////////////
--- a/tools/profiler/src/options.cu
+++ b/tools/profiler/src/options.cu
@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline):
  }
  else if (cmdline.check_cmd_line_flag("kernels")) {
    cmdline.get_cmd_line_arguments("kernels", operation_names);
+    profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
  }

  if (cmdline.check_cmd_line_flag("ignore-kernels")) {
    cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
+    profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
  }

  // Prevent launches on the device for anything other than CUTLASS operation
--- a/tools/profiler/src/options.h
+++ b/tools/profiler/src/options.h
@ -196,6 +196,9 @@ public:
    /// If true, profiling is actually conducted.
    bool enabled;

+    /// If true, profiling returns an error code if no kernels are found to match the filters.
+    bool error_on_no_match = false;
+
    /// List of providers of each functionality to be profiled
    ProviderVector providers;